namei.c source code [linux/fs/namei.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* linux/fs/namei.c
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*/
7
8	/*
9	* Some corrections by tytso.
10	*/
11
12	/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
13	* lookup logic.
14	*/
15	/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
16	*/
17
18	#include <linux/init.h>
19	#include <linux/export.h>
20	#include <linux/slab.h>
21	#include <linux/wordpart.h>
22	#include <linux/fs.h>
23	#include <linux/filelock.h>
24	#include <linux/namei.h>
25	#include <linux/pagemap.h>
26	#include <linux/sched/mm.h>
27	#include <linux/fsnotify.h>
28	#include <linux/personality.h>
29	#include <linux/security.h>
30	#include <linux/syscalls.h>
31	#include <linux/mount.h>
32	#include <linux/audit.h>
33	#include <linux/capability.h>
34	#include <linux/file.h>
35	#include <linux/fcntl.h>
36	#include <linux/device_cgroup.h>
37	#include <linux/fs_struct.h>
38	#include <linux/posix_acl.h>
39	#include <linux/hash.h>
40	#include <linux/bitops.h>
41	#include <linux/init_task.h>
42	#include <linux/uaccess.h>
43
44	#include "internal.h"
45	#include "mount.h"
46
47	/* [Feb-1997 T. Schoebel-Theuer]
48	* Fundamental changes in the pathname lookup mechanisms (namei)
49	* were necessary because of omirr. The reason is that omirr needs
50	* to know the _real_ pathname, not the user-supplied one, in case
51	* of symlinks (and also when transname replacements occur).
52	*
53	* The new code replaces the old recursive symlink resolution with
54	* an iterative one (in case of non-nested symlink chains). It does
55	* this with calls to <fs>_follow_link().
56	* As a side effect, dir_namei(), _namei() and follow_link() are now
57	* replaced with a single function lookup_dentry() that can handle all
58	* the special cases of the former code.
59	*
60	* With the new dcache, the pathname is stored at each inode, at least as
61	* long as the refcount of the inode is positive. As a side effect, the
62	* size of the dcache depends on the inode cache and thus is dynamic.
63	*
64	* [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
65	* resolution to correspond with current state of the code.
66	*
67	* Note that the symlink resolution is not completely iterative.
68	* There is still a significant amount of tail- and mid- recursion in
69	* the algorithm. Also, note that <fs>_readlink() is not used in
70	* lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
71	* may return different results than <fs>_follow_link(). Many virtual
72	* filesystems (including /proc) exhibit this behavior.
73	*/
74
75	/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
76	* New symlink semantics: when open() is called with flags O_CREAT \| O_EXCL
77	* and the name already exists in form of a symlink, try to create the new
78	* name indicated by the symlink. The old code always complained that the
79	* name already exists, due to not following the symlink even if its target
80	* is nonexistent. The new semantics affects also mknod() and link() when
81	* the name is a symlink pointing to a non-existent name.
82	*
83	* I don't know which semantics is the right one, since I have no access
84	* to standards. But I found by trial that HP-UX 9.0 has the full "new"
85	* semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
86	* "old" one. Personally, I think the new semantics is much more logical.
87	* Note that "ln old new" where "new" is a symlink pointing to a non-existing
88	* file does succeed in both HP-UX and SunOs, but not in Solaris
89	* and in the old Linux semantics.
90	*/
91
92	/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
93	* semantics. See the comments in "open_namei" and "do_link" below.
94	*
95	* [10-Sep-98 Alan Modra] Another symlink change.
96	*/
97
98	/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
99	* inside the path - always follow.
100	* in the last component in creation/removal/renaming - never follow.
101	* if LOOKUP_FOLLOW passed - follow.
102	* if the pathname has trailing slashes - follow.
103	* otherwise - don't follow.
104	* (applied in that order).
105	*
106	* [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
107	* restored for 2.4. This is the last surviving part of old 4.2BSD bug.
108	* During the 2.4 we need to fix the userland stuff depending on it -
109	* hopefully we will be able to get rid of that wart in 2.5. So far only
110	* XEmacs seems to be relying on it...
111	*/
112	/*
113	* [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
114	* implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
115	* any extra contention...
116	*/
117
118	/* In order to reduce some races, while at the same time doing additional
119	* checking and hopefully speeding things up, we copy filenames to the
120	* kernel data space before using them..
121	*
122	* POSIX.1 2.4: an empty pathname is invalid (ENOENT).
123	* PATH_MAX includes the nul terminator --RR.
124	*/
125
126	#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
127
128	struct filename *
129	getname_flags(const char __user filename, int* flags, int *empty)
130	{
131	struct filename *result;
132	char *kname;
133	int len;
134
135	result = audit_reusename(name: filename);
136	if (result)
137	return result;
138
139	result = __getname();
140	if (unlikely(!result))
141	return ERR_PTR(error: -ENOMEM);
142
143	/*
144	* First, try to embed the struct filename inside the names_cache
145	* allocation
146	*/
147	kname = (char *)result->iname;
148	result->name = kname;
149
150	len = strncpy_from_user(dst: kname, src: filename, EMBEDDED_NAME_MAX);
151	if (unlikely(len < `0`)) {
152	__putname(result);
153	return ERR_PTR(error: len);
154	}
155
156	/*
157	* Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
158	* separate struct filename so we can dedicate the entire
159	* names_cache allocation for the pathname, and re-do the copy from
160	* userland.
161	*/
162	if (unlikely(len == EMBEDDED_NAME_MAX)) {
163	const size_t size = offsetof(struct filename, iname[`1`]);
164	kname = (char *)result;
165
166	/*
167	* size is chosen that way we to guarantee that
168	* result->iname[0] is within the same object and that
169	* kname can't be equal to result->iname, no matter what.
170	*/
171	result = kzalloc(size, GFP_KERNEL);
172	if (unlikely(!result)) {
173	__putname(kname);
174	return ERR_PTR(error: -ENOMEM);
175	}
176	result->name = kname;
177	len = strncpy_from_user(dst: kname, src: filename, PATH_MAX);
178	if (unlikely(len < `0`)) {
179	__putname(kname);
180	kfree(objp: result);
181	return ERR_PTR(error: len);
182	}
183	if (unlikely(len == PATH_MAX)) {
184	__putname(kname);
185	kfree(objp: result);
186	return ERR_PTR(error: -ENAMETOOLONG);
187	}
188	}
189
190	atomic_set(v: &result->refcnt, i: `1`);
191	/ The empty path is special. /
192	if (unlikely(!len)) {
193	if (empty)
194	*empty = `1`;
195	if (!(flags & LOOKUP_EMPTY)) {
196	putname(name: result);
197	return ERR_PTR(error: -ENOENT);
198	}
199	}
200
201	result->uptr = filename;
202	result->aname = NULL;
203	audit_getname(name: result);
204	return result;
205	}
206
207	struct filename *
208	getname_uflags(const char __user filename, int* uflags)
209	{
210	int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : `0`;
211
212	return getname_flags(filename, flags, NULL);
213	}
214
215	struct filename *
216	getname(const char __user * filename)
217	{
218	return getname_flags(filename, flags: `0`, NULL);
219	}
220
221	struct filename *
222	getname_kernel(const char * filename)
223	{
224	struct filename *result;
225	int len = strlen(filename) + `1`;
226
227	result = __getname();
228	if (unlikely(!result))
229	return ERR_PTR(error: -ENOMEM);
230
231	if (len <= EMBEDDED_NAME_MAX) {
232	result->name = (char *)result->iname;
233	} else if (len <= PATH_MAX) {
234	const size_t size = offsetof(struct filename, iname[`1`]);
235	struct filename *tmp;
236
237	tmp = kmalloc(size, GFP_KERNEL);
238	if (unlikely(!tmp)) {
239	__putname(result);
240	return ERR_PTR(error: -ENOMEM);
241	}
242	tmp->name = (char *)result;
243	result = tmp;
244	} else {
245	__putname(result);
246	return ERR_PTR(error: -ENAMETOOLONG);
247	}
248	memcpy((char *)result->name, filename, len);
249	result->uptr = NULL;
250	result->aname = NULL;
251	atomic_set(v: &result->refcnt, i: `1`);
252	audit_getname(name: result);
253
254	return result;
255	}
256	EXPORT_SYMBOL(getname_kernel);
257
258	void putname(struct filename *name)
259	{
260	if (IS_ERR(ptr: name))
261	return;
262
263	if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
264	return;
265
266	if (!atomic_dec_and_test(v: &name->refcnt))
267	return;
268
269	if (name->name != name->iname) {
270	__putname(name->name);
271	kfree(objp: name);
272	} else
273	__putname(name);
274	}
275	EXPORT_SYMBOL(putname);
276
277	/**
278	* check_acl - perform ACL permission checking
279	* @idmap: idmap of the mount the inode was found from
280	* @inode: inode to check permissions on
281	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
282	*
283	* This function performs the ACL permission checking. Since this function
284	* retrieve POSIX acls it needs to know whether it is called from a blocking or
285	* non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
286	*
287	* If the inode has been found through an idmapped mount the idmap of
288	* the vfsmount must be passed through @idmap. This function will then take
289	* care to map the inode according to @idmap before checking permissions.
290	* On non-idmapped mounts or if permission checking is to be performed on the
291	* raw inode simply pass @nop_mnt_idmap.
292	*/
293	static int check_acl(struct mnt_idmap *idmap,
294	struct inode inode, int* mask)
295	{
296	#ifdef CONFIG_FS_POSIX_ACL
297	struct posix_acl *acl;
298
299	if (mask & MAY_NOT_BLOCK) {
300	acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
301	if (!acl)
302	return -EAGAIN;
303	/ no ->get_inode_acl() calls in RCU mode... /
304	if (is_uncached_acl(acl))
305	return -ECHILD;
306	return posix_acl_permission(idmap, inode, acl, mask);
307	}
308
309	acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
310	if (IS_ERR(ptr: acl))
311	return PTR_ERR(ptr: acl);
312	if (acl) {
313	int error = posix_acl_permission(idmap, inode, acl, mask);
314	posix_acl_release(acl);
315	return error;
316	}
317	#endif
318
319	return -EAGAIN;
320	}
321
322	/**
323	* acl_permission_check - perform basic UNIX permission checking
324	* @idmap: idmap of the mount the inode was found from
325	* @inode: inode to check permissions on
326	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
327	*
328	* This function performs the basic UNIX permission checking. Since this
329	* function may retrieve POSIX acls it needs to know whether it is called from a
330	* blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
331	*
332	* If the inode has been found through an idmapped mount the idmap of
333	* the vfsmount must be passed through @idmap. This function will then take
334	* care to map the inode according to @idmap before checking permissions.
335	* On non-idmapped mounts or if permission checking is to be performed on the
336	* raw inode simply pass @nop_mnt_idmap.
337	*/
338	static int acl_permission_check(struct mnt_idmap *idmap,
339	struct inode inode, int* mask)
340	{
341	unsigned int mode = inode->i_mode;
342	vfsuid_t vfsuid;
343
344	/ Are we the owner? If so, ACL's don't matter /
345	vfsuid = i_uid_into_vfsuid(idmap, inode);
346	if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
347	mask &= `7`;
348	mode >>= `6`;
349	return (mask & ~mode) ? -EACCES : `0`;
350	}
351
352	/ Do we have ACL's? /
353	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
354	int error = check_acl(idmap, inode, mask);
355	if (error != -EAGAIN)
356	return error;
357	}
358
359	/ Only RWX matters for group/other mode bits /
360	mask &= `7`;
361
362	/*
363	* Are the group permissions different from
364	* the other permissions in the bits we care
365	* about? Need to check group ownership if so.
366	*/
367	if (mask & (mode ^ (mode >> `3`))) {
368	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
369	if (vfsgid_in_group_p(vfsgid))
370	mode >>= `3`;
371	}
372
373	/ Bits in 'mode' clear that we require? /
374	return (mask & ~mode) ? -EACCES : `0`;
375	}
376
377	/**
378	* generic_permission - check for access rights on a Posix-like filesystem
379	* @idmap: idmap of the mount the inode was found from
380	* @inode: inode to check access rights for
381	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
382	* %MAY_NOT_BLOCK ...)
383	*
384	* Used to check for read/write/execute permissions on a file.
385	* We use "fsuid" for this, letting us set arbitrary permissions
386	* for filesystem access without changing the "normal" uids which
387	* are used for other things.
388	*
389	* generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
390	* request cannot be satisfied (eg. requires blocking or too much complexity).
391	* It would then be called again in ref-walk mode.
392	*
393	* If the inode has been found through an idmapped mount the idmap of
394	* the vfsmount must be passed through @idmap. This function will then take
395	* care to map the inode according to @idmap before checking permissions.
396	* On non-idmapped mounts or if permission checking is to be performed on the
397	* raw inode simply pass @nop_mnt_idmap.
398	*/
399	int generic_permission(struct mnt_idmap idmap, struct* inode *inode,
400	int mask)
401	{
402	int ret;
403
404	/*
405	* Do the basic permission checks.
406	*/
407	ret = acl_permission_check(idmap, inode, mask);
408	if (ret != -EACCES)
409	return ret;
410
411	if (S_ISDIR(inode->i_mode)) {
412	/ DACs are overridable for directories /
413	if (!(mask & MAY_WRITE))
414	if (capable_wrt_inode_uidgid(idmap, inode,
415	CAP_DAC_READ_SEARCH))
416	return `0`;
417	if (capable_wrt_inode_uidgid(idmap, inode,
418	CAP_DAC_OVERRIDE))
419	return `0`;
420	return -EACCES;
421	}
422
423	/*
424	* Searching includes executable on directories, else just read.
425	*/
426	mask &= MAY_READ \| MAY_WRITE \| MAY_EXEC;
427	if (mask == MAY_READ)
428	if (capable_wrt_inode_uidgid(idmap, inode,
429	CAP_DAC_READ_SEARCH))
430	return `0`;
431	/*
432	* Read/write DACs are always overridable.
433	* Executable DACs are overridable when there is
434	* at least one exec bit set.
435	*/
436	if (!(mask & MAY_EXEC) \|\| (inode->i_mode & S_IXUGO))
437	if (capable_wrt_inode_uidgid(idmap, inode,
438	CAP_DAC_OVERRIDE))
439	return `0`;
440
441	return -EACCES;
442	}
443	EXPORT_SYMBOL(generic_permission);
444
445	/**
446	* do_inode_permission - UNIX permission checking
447	* @idmap: idmap of the mount the inode was found from
448	* @inode: inode to check permissions on
449	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
450	*
451	* We _really_ want to just do "generic_permission()" without
452	* even looking at the inode->i_op values. So we keep a cache
453	* flag in inode->i_opflags, that says "this has not special
454	* permission function, use the fast case".
455	*/
456	static inline int do_inode_permission(struct mnt_idmap *idmap,
457	struct inode inode, int* mask)
458	{
459	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
460	if (likely(inode->i_op->permission))
461	return inode->i_op->permission(idmap, inode, mask);
462
463	/ This gets set once for the inode lifetime /
464	spin_lock(lock: &inode->i_lock);
465	inode->i_opflags \|= IOP_FASTPERM;
466	spin_unlock(lock: &inode->i_lock);
467	}
468	return generic_permission(idmap, inode, mask);
469	}
470
471	/**
472	* sb_permission - Check superblock-level permissions
473	* @sb: Superblock of inode to check permission on
474	* @inode: Inode to check permission on
475	* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
476	*
477	* Separate out file-system wide checks from inode-specific permission checks.
478	*/
479	static int sb_permission(struct super_block sb, struct* inode inode, int* mask)
480	{
481	if (unlikely(mask & MAY_WRITE)) {
482	umode_t mode = inode->i_mode;
483
484	/ Nobody gets write access to a read-only fs. /
485	if (sb_rdonly(sb) && (S_ISREG(mode) \|\| S_ISDIR(mode) \|\| S_ISLNK(mode)))
486	return -EROFS;
487	}
488	return `0`;
489	}
490
491	/**
492	* inode_permission - Check for access rights to a given inode
493	* @idmap: idmap of the mount the inode was found from
494	* @inode: Inode to check permission on
495	* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
496	*
497	* Check for read/write/execute permissions on an inode. We use fs[ug]id for
498	* this, letting us set arbitrary permissions for filesystem access without
499	* changing the "normal" UIDs which are used for other things.
500	*
501	* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
502	*/
503	int inode_permission(struct mnt_idmap *idmap,
504	struct inode inode, int* mask)
505	{
506	int retval;
507
508	retval = sb_permission(sb: inode->i_sb, inode, mask);
509	if (retval)
510	return retval;
511
512	if (unlikely(mask & MAY_WRITE)) {
513	/*
514	* Nobody gets write access to an immutable file.
515	*/
516	if (IS_IMMUTABLE(inode))
517	return -EPERM;
518
519	/*
520	* Updating mtime will likely cause i_uid and i_gid to be
521	* written back improperly if their true value is unknown
522	* to the vfs.
523	*/
524	if (HAS_UNMAPPED_ID(idmap, inode))
525	return -EACCES;
526	}
527
528	retval = do_inode_permission(idmap, inode, mask);
529	if (retval)
530	return retval;
531
532	retval = devcgroup_inode_permission(inode, mask);
533	if (retval)
534	return retval;
535
536	return security_inode_permission(inode, mask);
537	}
538	EXPORT_SYMBOL(inode_permission);
539
540	/**
541	* path_get - get a reference to a path
542	* @path: path to get the reference to
543	*
544	* Given a path increment the reference count to the dentry and the vfsmount.
545	*/
546	void path_get(const struct path *path)
547	{
548	mntget(mnt: path->mnt);
549	dget(dentry: path->dentry);
550	}
551	EXPORT_SYMBOL(path_get);
552
553	/**
554	* path_put - put a reference to a path
555	* @path: path to put the reference to
556	*
557	* Given a path decrement the reference count to the dentry and the vfsmount.
558	*/
559	void path_put(const struct path *path)
560	{
561	dput(path->dentry);
562	mntput(mnt: path->mnt);
563	}
564	EXPORT_SYMBOL(path_put);
565
566	#define EMBEDDED_LEVELS 2
567	struct nameidata {
568	struct path path;
569	struct qstr last;
570	struct path root;
571	struct inode inode; /* path.dentry.d_inode /
572	unsigned int flags, state;
573	unsigned seq, next_seq, m_seq, r_seq;
574	int last_type;
575	unsigned depth;
576	int total_link_count;
577	struct saved {
578	struct path link;
579	struct delayed_call done;
580	const char *name;
581	unsigned seq;
582	} *stack, internal[EMBEDDED_LEVELS];
583	struct filename *name;
584	struct nameidata *saved;
585	unsigned root_seq;
586	int dfd;
587	vfsuid_t dir_vfsuid;
588	umode_t dir_mode;
589	} __randomize_layout;
590
591	#define ND_ROOT_PRESET 1
592	#define ND_ROOT_GRABBED 2
593	#define ND_JUMPED 4
594
595	static void __set_nameidata(struct nameidata p, int* dfd, struct filename *name)
596	{
597	struct nameidata *old = current->nameidata;
598	p->stack = p->internal;
599	p->depth = `0`;
600	p->dfd = dfd;
601	p->name = name;
602	p->path.mnt = NULL;
603	p->path.dentry = NULL;
604	p->total_link_count = old ? old->total_link_count : `0`;
605	p->saved = old;
606	current->nameidata = p;
607	}
608
609	static inline void set_nameidata(struct nameidata p, int* dfd, struct filename *name,
610	const struct path *root)
611	{
612	__set_nameidata(p, dfd, name);
613	p->state = `0`;
614	if (unlikely(root)) {
615	p->state = ND_ROOT_PRESET;
616	p->root = *root;
617	}
618	}
619
620	static void restore_nameidata(void)
621	{
622	struct nameidata now = current->nameidata, old = now->saved;
623
624	current->nameidata = old;
625	if (old)
626	old->total_link_count = now->total_link_count;
627	if (now->stack != now->internal)
628	kfree(objp: now->stack);
629	}
630
631	static bool nd_alloc_stack(struct nameidata *nd)
632	{
633	struct saved *p;
634
635	p= kmalloc_array(MAXSYMLINKS, size: sizeof(struct saved),
636	flags: nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
637	if (unlikely(!p))
638	return false;
639	memcpy(p, nd->internal, sizeof(nd->internal));
640	nd->stack = p;
641	return true;
642	}
643
644	/**
645	* path_connected - Verify that a dentry is below mnt.mnt_root
646	* @mnt: The mountpoint to check.
647	* @dentry: The dentry to check.
648	*
649	* Rename can sometimes move a file or directory outside of a bind
650	* mount, path_connected allows those cases to be detected.
651	*/
652	static bool path_connected(struct vfsmount mnt, struct* dentry *dentry)
653	{
654	struct super_block *sb = mnt->mnt_sb;
655
656	/ Bind mounts can have disconnected paths /
657	if (mnt->mnt_root == sb->s_root)
658	return true;
659
660	return is_subdir(dentry, mnt->mnt_root);
661	}
662
663	static void drop_links(struct nameidata *nd)
664	{
665	int i = nd->depth;
666	while (i--) {
667	struct saved *last = nd->stack + i;
668	do_delayed_call(call: &last->done);
669	clear_delayed_call(call: &last->done);
670	}
671	}
672
673	static void leave_rcu(struct nameidata *nd)
674	{
675	nd->flags &= ~LOOKUP_RCU;
676	nd->seq = nd->next_seq = `0`;
677	rcu_read_unlock();
678	}
679
680	static void terminate_walk(struct nameidata *nd)
681	{
682	drop_links(nd);
683	if (!(nd->flags & LOOKUP_RCU)) {
684	int i;
685	path_put(&nd->path);
686	for (i = `0`; i < nd->depth; i++)
687	path_put(&nd->stack[i].link);
688	if (nd->state & ND_ROOT_GRABBED) {
689	path_put(&nd->root);
690	nd->state &= ~ND_ROOT_GRABBED;
691	}
692	} else {
693	leave_rcu(nd);
694	}
695	nd->depth = `0`;
696	nd->path.mnt = NULL;
697	nd->path.dentry = NULL;
698	}
699
700	/ path_put is needed afterwards regardless of success or failure /
701	static bool __legitimize_path(struct path path, unsigned* seq, unsigned mseq)
702	{
703	int res = __legitimize_mnt(path->mnt, mseq);
704	if (unlikely(res)) {
705	if (res > `0`)
706	path->mnt = NULL;
707	path->dentry = NULL;
708	return false;
709	}
710	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
711	path->dentry = NULL;
712	return false;
713	}
714	return !read_seqcount_retry(&path->dentry->d_seq, seq);
715	}
716
717	static inline bool legitimize_path(struct nameidata *nd,
718	struct path path, unsigned* seq)
719	{
720	return __legitimize_path(path, seq, mseq: nd->m_seq);
721	}
722
723	static bool legitimize_links(struct nameidata *nd)
724	{
725	int i;
726	if (unlikely(nd->flags & LOOKUP_CACHED)) {
727	drop_links(nd);
728	nd->depth = `0`;
729	return false;
730	}
731	for (i = `0`; i < nd->depth; i++) {
732	struct saved *last = nd->stack + i;
733	if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
734	drop_links(nd);
735	nd->depth = i + `1`;
736	return false;
737	}
738	}
739	return true;
740	}
741
742	static bool legitimize_root(struct nameidata *nd)
743	{
744	/ Nothing to do if nd->root is zero or is managed by the VFS user. /
745	if (!nd->root.mnt \|\| (nd->state & ND_ROOT_PRESET))
746	return true;
747	nd->state \|= ND_ROOT_GRABBED;
748	return legitimize_path(nd, path: &nd->root, seq: nd->root_seq);
749	}
750
751	/*
752	* Path walking has 2 modes, rcu-walk and ref-walk (see
753	* Documentation/filesystems/path-lookup.txt). In situations when we can't
754	* continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
755	* normal reference counts on dentries and vfsmounts to transition to ref-walk
756	* mode. Refcounts are grabbed at the last known good point before rcu-walk
757	* got stuck, so ref-walk may continue from there. If this is not successful
758	* (eg. a seqcount has changed), then failure is returned and it's up to caller
759	* to restart the path walk from the beginning in ref-walk mode.
760	*/
761
762	/**
763	* try_to_unlazy - try to switch to ref-walk mode.
764	* @nd: nameidata pathwalk data
765	* Returns: true on success, false on failure
766	*
767	* try_to_unlazy attempts to legitimize the current nd->path and nd->root
768	* for ref-walk mode.
769	* Must be called from rcu-walk context.
770	* Nothing should touch nameidata between try_to_unlazy() failure and
771	* terminate_walk().
772	*/
773	static bool try_to_unlazy(struct nameidata *nd)
774	{
775	struct dentry *parent = nd->path.dentry;
776
777	BUG_ON(!(nd->flags & LOOKUP_RCU));
778
779	if (unlikely(!legitimize_links(nd)))
780	goto out1;
781	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
782	goto out;
783	if (unlikely(!legitimize_root(nd)))
784	goto out;
785	leave_rcu(nd);
786	BUG_ON(nd->inode != parent->d_inode);
787	return true;
788
789	out1:
790	nd->path.mnt = NULL;
791	nd->path.dentry = NULL;
792	out:
793	leave_rcu(nd);
794	return false;
795	}
796
797	/**
798	* try_to_unlazy_next - try to switch to ref-walk mode.
799	* @nd: nameidata pathwalk data
800	* @dentry: next dentry to step into
801	* Returns: true on success, false on failure
802	*
803	* Similar to try_to_unlazy(), but here we have the next dentry already
804	* picked by rcu-walk and want to legitimize that in addition to the current
805	* nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context.
806	* Nothing should touch nameidata between try_to_unlazy_next() failure and
807	* terminate_walk().
808	*/
809	static bool try_to_unlazy_next(struct nameidata nd, struct* dentry *dentry)
810	{
811	int res;
812	BUG_ON(!(nd->flags & LOOKUP_RCU));
813
814	if (unlikely(!legitimize_links(nd)))
815	goto out2;
816	res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
817	if (unlikely(res)) {
818	if (res > `0`)
819	goto out2;
820	goto out1;
821	}
822	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
823	goto out1;
824
825	/*
826	* We need to move both the parent and the dentry from the RCU domain
827	* to be properly refcounted. And the sequence number in the dentry
828	* validates both dentry counters, since we checked the sequence
829	* number of the parent after we got the child sequence number. So we
830	* know the parent must still be valid if the child sequence number is
831	*/
832	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
833	goto out;
834	if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
835	goto out_dput;
836	/*
837	* Sequence counts matched. Now make sure that the root is
838	* still valid and get it if required.
839	*/
840	if (unlikely(!legitimize_root(nd)))
841	goto out_dput;
842	leave_rcu(nd);
843	return true;
844
845	out2:
846	nd->path.mnt = NULL;
847	out1:
848	nd->path.dentry = NULL;
849	out:
850	leave_rcu(nd);
851	return false;
852	out_dput:
853	leave_rcu(nd);
854	dput(dentry);
855	return false;
856	}
857
858	static inline int d_revalidate(struct dentry dentry, unsigned* int flags)
859	{
860	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
861	return dentry->d_op->d_revalidate(dentry, flags);
862	else
863	return `1`;
864	}
865
866	/**
867	* complete_walk - successful completion of path walk
868	* @nd: pointer nameidata
869	*
870	* If we had been in RCU mode, drop out of it and legitimize nd->path.
871	* Revalidate the final result, unless we'd already done that during
872	* the path walk or the filesystem doesn't ask for it. Return 0 on
873	* success, -error on failure. In case of failure caller does not
874	* need to drop nd->path.
875	*/
876	static int complete_walk(struct nameidata *nd)
877	{
878	struct dentry *dentry = nd->path.dentry;
879	int status;
880
881	if (nd->flags & LOOKUP_RCU) {
882	/*
883	* We don't want to zero nd->root for scoped-lookups or
884	* externally-managed nd->root.
885	*/
886	if (!(nd->state & ND_ROOT_PRESET))
887	if (!(nd->flags & LOOKUP_IS_SCOPED))
888	nd->root.mnt = NULL;
889	nd->flags &= ~LOOKUP_CACHED;
890	if (!try_to_unlazy(nd))
891	return -ECHILD;
892	}
893
894	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
895	/*
896	* While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
897	* ever step outside the root during lookup" and should already
898	* be guaranteed by the rest of namei, we want to avoid a namei
899	* BUG resulting in userspace being given a path that was not
900	* scoped within the root at some point during the lookup.
901	*
902	* So, do a final sanity-check to make sure that in the
903	* worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
904	* we won't silently return an fd completely outside of the
905	* requested root to userspace.
906	*
907	* Userspace could move the path outside the root after this
908	* check, but as discussed elsewhere this is not a concern (the
909	* resolved file was inside the root at some point).
910	*/
911	if (!path_is_under(&nd->path, &nd->root))
912	return -EXDEV;
913	}
914
915	if (likely(!(nd->state & ND_JUMPED)))
916	return `0`;
917
918	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
919	return `0`;
920
921	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
922	if (status > `0`)
923	return `0`;
924
925	if (!status)
926	status = -ESTALE;
927
928	return status;
929	}
930
931	static int set_root(struct nameidata *nd)
932	{
933	struct fs_struct *fs = current->fs;
934
935	/*
936	* Jumping to the real root in a scoped-lookup is a BUG in namei, but we
937	* still have to ensure it doesn't happen because it will cause a breakout
938	* from the dirfd.
939	*/
940	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
941	return -ENOTRECOVERABLE;
942
943	if (nd->flags & LOOKUP_RCU) {
944	unsigned seq;
945
946	do {
947	seq = read_seqcount_begin(&fs->seq);
948	nd->root = fs->root;
949	nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
950	} while (read_seqcount_retry(&fs->seq, seq));
951	} else {
952	get_fs_root(fs, root: &nd->root);
953	nd->state \|= ND_ROOT_GRABBED;
954	}
955	return `0`;
956	}
957
958	static int nd_jump_root(struct nameidata *nd)
959	{
960	if (unlikely(nd->flags & LOOKUP_BENEATH))
961	return -EXDEV;
962	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
963	/ Absolute path arguments to path_init() are allowed. /
964	if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
965	return -EXDEV;
966	}
967	if (!nd->root.mnt) {
968	int error = set_root(nd);
969	if (error)
970	return error;
971	}
972	if (nd->flags & LOOKUP_RCU) {
973	struct dentry *d;
974	nd->path = nd->root;
975	d = nd->path.dentry;
976	nd->inode = d->d_inode;
977	nd->seq = nd->root_seq;
978	if (read_seqcount_retry(&d->d_seq, nd->seq))
979	return -ECHILD;
980	} else {
981	path_put(&nd->path);
982	nd->path = nd->root;
983	path_get(&nd->path);
984	nd->inode = nd->path.dentry->d_inode;
985	}
986	nd->state \|= ND_JUMPED;
987	return `0`;
988	}
989
990	/*
991	* Helper to directly jump to a known parsed path from ->get_link,
992	* caller must have taken a reference to path beforehand.
993	*/
994	int nd_jump_link(const struct path *path)
995	{
996	int error = -ELOOP;
997	struct nameidata *nd = current->nameidata;
998
999	if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
1000	goto err;
1001
1002	error = -EXDEV;
1003	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
1004	if (nd->path.mnt != path->mnt)
1005	goto err;
1006	}
1007	/ Not currently safe for scoped-lookups. /
1008	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
1009	goto err;
1010
1011	path_put(&nd->path);
1012	nd->path = *path;
1013	nd->inode = nd->path.dentry->d_inode;
1014	nd->state \|= ND_JUMPED;
1015	return `0`;
1016
1017	err:
1018	path_put(path);
1019	return error;
1020	}
1021
1022	static inline void put_link(struct nameidata *nd)
1023	{
1024	struct saved *last = nd->stack + --nd->depth;
1025	do_delayed_call(call: &last->done);
1026	if (!(nd->flags & LOOKUP_RCU))
1027	path_put(&last->link);
1028	}
1029
1030	static int sysctl_protected_symlinks __read_mostly;
1031	static int sysctl_protected_hardlinks __read_mostly;
1032	static int sysctl_protected_fifos __read_mostly;
1033	static int sysctl_protected_regular __read_mostly;
1034
1035	#ifdef CONFIG_SYSCTL
1036	static struct ctl_table namei_sysctls[] = {
1037	{
1038	.procname = "protected_symlinks",
1039	.data = &sysctl_protected_symlinks,
1040	.maxlen = sizeof(int),
1041	.mode = `0644`,
1042	.proc_handler = proc_dointvec_minmax,
1043	.extra1 = SYSCTL_ZERO,
1044	.extra2 = SYSCTL_ONE,
1045	},
1046	{
1047	.procname = "protected_hardlinks",
1048	.data = &sysctl_protected_hardlinks,
1049	.maxlen = sizeof(int),
1050	.mode = `0644`,
1051	.proc_handler = proc_dointvec_minmax,
1052	.extra1 = SYSCTL_ZERO,
1053	.extra2 = SYSCTL_ONE,
1054	},
1055	{
1056	.procname = "protected_fifos",
1057	.data = &sysctl_protected_fifos,
1058	.maxlen = sizeof(int),
1059	.mode = `0644`,
1060	.proc_handler = proc_dointvec_minmax,
1061	.extra1 = SYSCTL_ZERO,
1062	.extra2 = SYSCTL_TWO,
1063	},
1064	{
1065	.procname = "protected_regular",
1066	.data = &sysctl_protected_regular,
1067	.maxlen = sizeof(int),
1068	.mode = `0644`,
1069	.proc_handler = proc_dointvec_minmax,
1070	.extra1 = SYSCTL_ZERO,
1071	.extra2 = SYSCTL_TWO,
1072	},
1073	};
1074
1075	static int __init init_fs_namei_sysctls(void)
1076	{
1077	register_sysctl_init("fs", namei_sysctls);
1078	return `0`;
1079	}
1080	fs_initcall(init_fs_namei_sysctls);
1081
1082	#endif /* CONFIG_SYSCTL */
1083
1084	/**
1085	* may_follow_link - Check symlink following for unsafe situations
1086	* @nd: nameidata pathwalk data
1087	* @inode: Used for idmapping.
1088	*
1089	* In the case of the sysctl_protected_symlinks sysctl being enabled,
1090	* CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
1091	* in a sticky world-writable directory. This is to protect privileged
1092	* processes from failing races against path names that may change out
1093	* from under them by way of other users creating malicious symlinks.
1094	* It will permit symlinks to be followed only when outside a sticky
1095	* world-writable directory, or when the uid of the symlink and follower
1096	* match, or when the directory owner matches the symlink's owner.
1097	*
1098	* Returns 0 if following the symlink is allowed, -ve on error.
1099	*/
1100	static inline int may_follow_link(struct nameidata nd, const* struct inode *inode)
1101	{
1102	struct mnt_idmap *idmap;
1103	vfsuid_t vfsuid;
1104
1105	if (!sysctl_protected_symlinks)
1106	return `0`;
1107
1108	idmap = mnt_idmap(mnt: nd->path.mnt);
1109	vfsuid = i_uid_into_vfsuid(idmap, inode);
1110	/ Allowed if owner and follower match. /
1111	if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
1112	return `0`;
1113
1114	/ Allowed if parent directory not sticky and world-writable. /
1115	if ((nd->dir_mode & (S_ISVTX\|S_IWOTH)) != (S_ISVTX\|S_IWOTH))
1116	return `0`;
1117
1118	/ Allowed if parent directory and link owner match. /
1119	if (vfsuid_valid(uid: nd->dir_vfsuid) && vfsuid_eq(left: nd->dir_vfsuid, right: vfsuid))
1120	return `0`;
1121
1122	if (nd->flags & LOOKUP_RCU)
1123	return -ECHILD;
1124
1125	audit_inode(name: nd->name, dentry: nd->stack[`0`].link.dentry, aflags: `0`);
1126	audit_log_path_denied(AUDIT_ANOM_LINK, operation: "follow_link");
1127	return -EACCES;
1128	}
1129
1130	/**
1131	* safe_hardlink_source - Check for safe hardlink conditions
1132	* @idmap: idmap of the mount the inode was found from
1133	* @inode: the source inode to hardlink from
1134	*
1135	* Return false if at least one of the following conditions:
1136	* - inode is not a regular file
1137	* - inode is setuid
1138	* - inode is setgid and group-exec
1139	* - access failure for read and write
1140	*
1141	* Otherwise returns true.
1142	*/
1143	static bool safe_hardlink_source(struct mnt_idmap *idmap,
1144	struct inode *inode)
1145	{
1146	umode_t mode = inode->i_mode;
1147
1148	/ Special files should not get pinned to the filesystem. /
1149	if (!S_ISREG(mode))
1150	return false;
1151
1152	/ Setuid files should not get pinned to the filesystem. /
1153	if (mode & S_ISUID)
1154	return false;
1155
1156	/ Executable setgid files should not get pinned to the filesystem. /
1157	if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP))
1158	return false;
1159
1160	/ Hardlinking to unreadable or unwritable sources is dangerous. /
1161	if (inode_permission(idmap, inode, MAY_READ \| MAY_WRITE))
1162	return false;
1163
1164	return true;
1165	}
1166
1167	/**
1168	* may_linkat - Check permissions for creating a hardlink
1169	* @idmap: idmap of the mount the inode was found from
1170	* @link: the source to hardlink from
1171	*
1172	* Block hardlink when all of:
1173	* - sysctl_protected_hardlinks enabled
1174	* - fsuid does not match inode
1175	* - hardlink source is unsafe (see safe_hardlink_source() above)
1176	* - not CAP_FOWNER in a namespace with the inode owner uid mapped
1177	*
1178	* If the inode has been found through an idmapped mount the idmap of
1179	* the vfsmount must be passed through @idmap. This function will then take
1180	* care to map the inode according to @idmap before checking permissions.
1181	* On non-idmapped mounts or if permission checking is to be performed on the
1182	* raw inode simply pass @nop_mnt_idmap.
1183	*
1184	* Returns 0 if successful, -ve on error.
1185	*/
1186	int may_linkat(struct mnt_idmap idmap, const* struct path *link)
1187	{
1188	struct inode *inode = link->dentry->d_inode;
1189
1190	/ Inode writeback is not safe when the uid or gid are invalid. /
1191	if (!vfsuid_valid(uid: i_uid_into_vfsuid(idmap, inode)) \|\|
1192	!vfsgid_valid(gid: i_gid_into_vfsgid(idmap, inode)))
1193	return -EOVERFLOW;
1194
1195	if (!sysctl_protected_hardlinks)
1196	return `0`;
1197
1198	/ Source inode owner (or CAP_FOWNER) can hardlink all they like,*
1199	* otherwise, it must be a safe source.
1200	*/
1201	if (safe_hardlink_source(idmap, inode) \|\|
1202	inode_owner_or_capable(idmap, inode))
1203	return `0`;
1204
1205	audit_log_path_denied(AUDIT_ANOM_LINK, operation: "linkat");
1206	return -EPERM;
1207	}
1208
1209	/**
1210	* may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
1211	* should be allowed, or not, on files that already
1212	* exist.
1213	* @idmap: idmap of the mount the inode was found from
1214	* @nd: nameidata pathwalk data
1215	* @inode: the inode of the file to open
1216	*
1217	* Block an O_CREAT open of a FIFO (or a regular file) when:
1218	* - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
1219	* - the file already exists
1220	* - we are in a sticky directory
1221	* - we don't own the file
1222	* - the owner of the directory doesn't own the file
1223	* - the directory is world writable
1224	* If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
1225	* the directory doesn't have to be world writable: being group writable will
1226	* be enough.
1227	*
1228	* If the inode has been found through an idmapped mount the idmap of
1229	* the vfsmount must be passed through @idmap. This function will then take
1230	* care to map the inode according to @idmap before checking permissions.
1231	* On non-idmapped mounts or if permission checking is to be performed on the
1232	* raw inode simply pass @nop_mnt_idmap.
1233	*
1234	* Returns 0 if the open is allowed, -ve on error.
1235	*/
1236	static int may_create_in_sticky(struct mnt_idmap *idmap,
1237	struct nameidata nd, struct* inode *const inode)
1238	{
1239	umode_t dir_mode = nd->dir_mode;
1240	vfsuid_t dir_vfsuid = nd->dir_vfsuid;
1241
1242	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) \|\|
1243	(!sysctl_protected_regular && S_ISREG(inode->i_mode)) \|\|
1244	likely(!(dir_mode & S_ISVTX)) \|\|
1245	vfsuid_eq(left: i_uid_into_vfsuid(idmap, inode), right: dir_vfsuid) \|\|
1246	vfsuid_eq_kuid(vfsuid: i_uid_into_vfsuid(idmap, inode), current_fsuid()))
1247	return `0`;
1248
1249	if (likely(dir_mode & `0002`) \|\|
1250	(dir_mode & `0020` &&
1251	((sysctl_protected_fifos >= `2` && S_ISFIFO(inode->i_mode)) \|\|
1252	(sysctl_protected_regular >= `2` && S_ISREG(inode->i_mode))))) {
1253	const char *operation = S_ISFIFO(inode->i_mode) ?
1254	"sticky_create_fifo" :
1255	"sticky_create_regular";
1256	audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
1257	return -EACCES;
1258	}
1259	return `0`;
1260	}
1261
1262	/*
1263	* follow_up - Find the mountpoint of path's vfsmount
1264	*
1265	* Given a path, find the mountpoint of its source file system.
1266	* Replace @path with the path of the mountpoint in the parent mount.
1267	* Up is towards /.
1268	*
1269	* Return 1 if we went up a level and 0 if we were already at the
1270	* root.
1271	*/
1272	int follow_up(struct path *path)
1273	{
1274	struct mount *mnt = real_mount(mnt: path->mnt);
1275	struct mount *parent;
1276	struct dentry *mountpoint;
1277
1278	read_seqlock_excl(sl: &mount_lock);
1279	parent = mnt->mnt_parent;
1280	if (parent == mnt) {
1281	read_sequnlock_excl(sl: &mount_lock);
1282	return `0`;
1283	}
1284	mntget(mnt: &parent->mnt);
1285	mountpoint = dget(dentry: mnt->mnt_mountpoint);
1286	read_sequnlock_excl(sl: &mount_lock);
1287	dput(path->dentry);
1288	path->dentry = mountpoint;
1289	mntput(mnt: path->mnt);
1290	path->mnt = &parent->mnt;
1291	return `1`;
1292	}
1293	EXPORT_SYMBOL(follow_up);
1294
1295	static bool choose_mountpoint_rcu(struct mount m, const* struct path *root,
1296	struct path path, unsigned* *seqp)
1297	{
1298	while (mnt_has_parent(mnt: m)) {
1299	struct dentry *mountpoint = m->mnt_mountpoint;
1300
1301	m = m->mnt_parent;
1302	if (unlikely(root->dentry == mountpoint &&
1303	root->mnt == &m->mnt))
1304	break;
1305	if (mountpoint != m->mnt.mnt_root) {
1306	path->mnt = &m->mnt;
1307	path->dentry = mountpoint;
1308	*seqp = read_seqcount_begin(&mountpoint->d_seq);
1309	return true;
1310	}
1311	}
1312	return false;
1313	}
1314
1315	static bool choose_mountpoint(struct mount m, const* struct path *root,
1316	struct path *path)
1317	{
1318	bool found;
1319
1320	rcu_read_lock();
1321	while (`1`) {
1322	unsigned seq, mseq = read_seqbegin(sl: &mount_lock);
1323
1324	found = choose_mountpoint_rcu(m, root, path, seqp: &seq);
1325	if (unlikely(!found)) {
1326	if (!read_seqretry(sl: &mount_lock, start: mseq))
1327	break;
1328	} else {
1329	if (likely(__legitimize_path(path, seq, mseq)))
1330	break;
1331	rcu_read_unlock();
1332	path_put(path);
1333	rcu_read_lock();
1334	}
1335	}
1336	rcu_read_unlock();
1337	return found;
1338	}
1339
1340	/*
1341	* Perform an automount
1342	* - return -EISDIR to tell follow_managed() to stop and return the path we
1343	* were called with.
1344	*/
1345	static int follow_automount(struct path path, int* count, unsigned* lookup_flags)
1346	{
1347	struct dentry *dentry = path->dentry;
1348
1349	/ We don't want to mount if someone's just doing a stat -*
1350	* unless they're stat'ing a directory and appended a '/' to
1351	* the name.
1352	*
1353	* We do, however, want to mount if someone wants to open or
1354	* create a file of any type under the mountpoint, wants to
1355	* traverse through the mountpoint or wants to open the
1356	* mounted directory. Also, autofs may mark negative dentries
1357	* as being automount points. These will need the attentions
1358	* of the daemon to instantiate them before they can be used.
1359	*/
1360	if (!(lookup_flags & (LOOKUP_PARENT \| LOOKUP_DIRECTORY \|
1361	LOOKUP_OPEN \| LOOKUP_CREATE \| LOOKUP_AUTOMOUNT)) &&
1362	dentry->d_inode)
1363	return -EISDIR;
1364
1365	if (count && (*count)++ >= MAXSYMLINKS)
1366	return -ELOOP;
1367
1368	return finish_automount(dentry->d_op->d_automount(path), path);
1369	}
1370
1371	/*
1372	* mount traversal - out-of-line part. One note on ->d_flags accesses -
1373	* dentries are pinned but not locked here, so negative dentry can go
1374	* positive right under us. Use of smp_load_acquire() provides a barrier
1375	* sufficient for ->d_inode and ->d_flags consistency.
1376	*/
1377	static int __traverse_mounts(struct path path, unsigned* flags, bool *jumped,
1378	int count, unsigned* lookup_flags)
1379	{
1380	struct vfsmount *mnt = path->mnt;
1381	bool need_mntput = false;
1382	int ret = `0`;
1383
1384	while (flags & DCACHE_MANAGED_DENTRY) {
1385	/ Allow the filesystem to manage the transit without i_mutex*
1386	* being held. */
1387	if (flags & DCACHE_MANAGE_TRANSIT) {
1388	ret = path->dentry->d_op->d_manage(path, false);
1389	flags = smp_load_acquire(&path->dentry->d_flags);
1390	if (ret < `0`)
1391	break;
1392	}
1393
1394	if (flags & DCACHE_MOUNTED) { // something's mounted on it..
1395	struct vfsmount *mounted = lookup_mnt(path);
1396	if (mounted) { // ... in our namespace
1397	dput(path->dentry);
1398	if (need_mntput)
1399	mntput(mnt: path->mnt);
1400	path->mnt = mounted;
1401	path->dentry = dget(dentry: mounted->mnt_root);
1402	// here we know it's positive
1403	flags = path->dentry->d_flags;
1404	need_mntput = true;
1405	continue;
1406	}
1407	}
1408
1409	if (!(flags & DCACHE_NEED_AUTOMOUNT))
1410	break;
1411
1412	// uncovered automount point
1413	ret = follow_automount(path, count, lookup_flags);
1414	flags = smp_load_acquire(&path->dentry->d_flags);
1415	if (ret < `0`)
1416	break;
1417	}
1418
1419	if (ret == -EISDIR)
1420	ret = `0`;
1421	// possible if you race with several mount --move
1422	if (need_mntput && path->mnt == mnt)
1423	mntput(mnt: path->mnt);
1424	if (!ret && unlikely(d_flags_negative(flags)))
1425	ret = -ENOENT;
1426	*jumped = need_mntput;
1427	return ret;
1428	}
1429
1430	static inline int traverse_mounts(struct path path, bool jumped,
1431	int count, unsigned* lookup_flags)
1432	{
1433	unsigned flags = smp_load_acquire(&path->dentry->d_flags);
1434
1435	/ fastpath /
1436	if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
1437	*jumped = false;
1438	if (unlikely(d_flags_negative(flags)))
1439	return -ENOENT;
1440	return `0`;
1441	}
1442	return __traverse_mounts(path, flags, jumped, count, lookup_flags);
1443	}
1444
1445	int follow_down_one(struct path *path)
1446	{
1447	struct vfsmount *mounted;
1448
1449	mounted = lookup_mnt(path);
1450	if (mounted) {
1451	dput(path->dentry);
1452	mntput(mnt: path->mnt);
1453	path->mnt = mounted;
1454	path->dentry = dget(dentry: mounted->mnt_root);
1455	return `1`;
1456	}
1457	return `0`;
1458	}
1459	EXPORT_SYMBOL(follow_down_one);
1460
1461	/*
1462	* Follow down to the covering mount currently visible to userspace. At each
1463	* point, the filesystem owning that dentry may be queried as to whether the
1464	* caller is permitted to proceed or not.
1465	*/
1466	int follow_down(struct path path, unsigned* int flags)
1467	{
1468	struct vfsmount *mnt = path->mnt;
1469	bool jumped;
1470	int ret = traverse_mounts(path, jumped: &jumped, NULL, lookup_flags: flags);
1471
1472	if (path->mnt != mnt)
1473	mntput(mnt);
1474	return ret;
1475	}
1476	EXPORT_SYMBOL(follow_down);
1477
1478	/*
1479	* Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
1480	* we meet a managed dentry that would need blocking.
1481	*/
1482	static bool __follow_mount_rcu(struct nameidata nd, struct* path *path)
1483	{
1484	struct dentry *dentry = path->dentry;
1485	unsigned int flags = dentry->d_flags;
1486
1487	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
1488	return true;
1489
1490	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1491	return false;
1492
1493	for (;;) {
1494	/*
1495	* Don't forget we might have a non-mountpoint managed dentry
1496	* that wants to block transit.
1497	*/
1498	if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
1499	int res = dentry->d_op->d_manage(path, true);
1500	if (res)
1501	return res == -EISDIR;
1502	flags = dentry->d_flags;
1503	}
1504
1505	if (flags & DCACHE_MOUNTED) {
1506	struct mount *mounted = __lookup_mnt(path->mnt, dentry);
1507	if (mounted) {
1508	path->mnt = &mounted->mnt;
1509	dentry = path->dentry = mounted->mnt.mnt_root;
1510	nd->state \|= ND_JUMPED;
1511	nd->next_seq = read_seqcount_begin(&dentry->d_seq);
1512	flags = dentry->d_flags;
1513	// makes sure that non-RCU pathwalk could reach
1514	// this state.
1515	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
1516	return false;
1517	continue;
1518	}
1519	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
1520	return false;
1521	}
1522	return !(flags & DCACHE_NEED_AUTOMOUNT);
1523	}
1524	}
1525
1526	static inline int handle_mounts(struct nameidata nd, struct* dentry *dentry,
1527	struct path *path)
1528	{
1529	bool jumped;
1530	int ret;
1531
1532	path->mnt = nd->path.mnt;
1533	path->dentry = dentry;
1534	if (nd->flags & LOOKUP_RCU) {
1535	unsigned int seq = nd->next_seq;
1536	if (likely(__follow_mount_rcu(nd, path)))
1537	return `0`;
1538	// path and nd->next_seq might've been clobbered*
1539	path->mnt = nd->path.mnt;
1540	path->dentry = dentry;
1541	nd->next_seq = seq;
1542	if (!try_to_unlazy_next(nd, dentry))
1543	return -ECHILD;
1544	}
1545	ret = traverse_mounts(path, jumped: &jumped, count: &nd->total_link_count, lookup_flags: nd->flags);
1546	if (jumped) {
1547	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1548	ret = -EXDEV;
1549	else
1550	nd->state \|= ND_JUMPED;
1551	}
1552	if (unlikely(ret)) {
1553	dput(path->dentry);
1554	if (path->mnt != nd->path.mnt)
1555	mntput(mnt: path->mnt);
1556	}
1557	return ret;
1558	}
1559
/*
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
 *
 * If d_revalidate() returns 0 the dentry is invalidated and dropped and
 * ERR_PTR(0), i.e. NULL, is returned; a negative result is returned as
 * ERR_PTR() after dropping the dentry.
 */
static struct dentry *lookup_dcache(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
{
	struct dentry *dentry = d_lookup(dir, name);
	if (dentry) {
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error)
				d_invalidate(dentry);
			dput(dentry);
			return ERR_PTR(error);
		}
	}
	return dentry;
}
1580
1581	/*
1582	* Parent directory has inode locked exclusive. This is one
1583	* and only case when ->lookup() gets called on non in-lookup
1584	* dentries - as the matter of fact, this only gets called
1585	* when directory is guaranteed to have no in-lookup children
1586	* at all.
1587	*/
1588	struct dentry lookup_one_qstr_excl(const* struct qstr *name,
1589	struct dentry *base,
1590	unsigned int flags)
1591	{
1592	struct dentry *dentry = lookup_dcache(name, dir: base, flags);
1593	struct dentry *old;
1594	struct inode *dir = base->d_inode;
1595
1596	if (dentry)
1597	return dentry;
1598
1599	/ Don't create child dentry for a dead directory. /
1600	if (unlikely(IS_DEADDIR(dir)))
1601	return ERR_PTR(error: -ENOENT);
1602
1603	dentry = d_alloc(base, name);
1604	if (unlikely(!dentry))
1605	return ERR_PTR(error: -ENOMEM);
1606
1607	old = dir->i_op->lookup(dir, dentry, flags);
1608	if (unlikely(old)) {
1609	dput(dentry);
1610	dentry = old;
1611	}
1612	return dentry;
1613	}
1614	EXPORT_SYMBOL(lookup_one_qstr_excl);
1615
1616	static struct dentry lookup_fast(struct* nameidata *nd)
1617	{
1618	struct dentry dentry, parent = nd->path.dentry;
1619	int status = `1`;
1620
1621	/*
1622	* Rename seqlock is not required here because in the off chance
1623	* of a false negative due to a concurrent rename, the caller is
1624	* going to fall back to non-racy lookup.
1625	*/
1626	if (nd->flags & LOOKUP_RCU) {
1627	dentry = __d_lookup_rcu(parent, name: &nd->last, seq: &nd->next_seq);
1628	if (unlikely(!dentry)) {
1629	if (!try_to_unlazy(nd))
1630	return ERR_PTR(error: -ECHILD);
1631	return NULL;
1632	}
1633
1634	/*
1635	* This sequence count validates that the parent had no
1636	* changes while we did the lookup of the dentry above.
1637	*/
1638	if (read_seqcount_retry(&parent->d_seq, nd->seq))
1639	return ERR_PTR(error: -ECHILD);
1640
1641	status = d_revalidate(dentry, flags: nd->flags);
1642	if (likely(status > `0`))
1643	return dentry;
1644	if (!try_to_unlazy_next(nd, dentry))
1645	return ERR_PTR(error: -ECHILD);
1646	if (status == -ECHILD)
1647	/ we'd been told to redo it in non-rcu mode /
1648	status = d_revalidate(dentry, flags: nd->flags);
1649	} else {
1650	dentry = __d_lookup(parent, &nd->last);
1651	if (unlikely(!dentry))
1652	return NULL;
1653	status = d_revalidate(dentry, flags: nd->flags);
1654	}
1655	if (unlikely(status <= `0`)) {
1656	if (!status)
1657	d_invalidate(dentry);
1658	dput(dentry);
1659	return ERR_PTR(error: status);
1660	}
1661	return dentry;
1662	}
1663
1664	/ Fast lookup failed, do it the slow way /
1665	static struct dentry __lookup_slow(const* struct qstr *name,
1666	struct dentry *dir,
1667	unsigned int flags)
1668	{
1669	struct dentry dentry, old;
1670	struct inode *inode = dir->d_inode;
1671	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1672
1673	/ Don't go there if it's already dead /
1674	if (unlikely(IS_DEADDIR(inode)))
1675	return ERR_PTR(error: -ENOENT);
1676	again:
1677	dentry = d_alloc_parallel(dir, name, &wq);
1678	if (IS_ERR(ptr: dentry))
1679	return dentry;
1680	if (unlikely(!d_in_lookup(dentry))) {
1681	int error = d_revalidate(dentry, flags);
1682	if (unlikely(error <= `0`)) {
1683	if (!error) {
1684	d_invalidate(dentry);
1685	dput(dentry);
1686	goto again;
1687	}
1688	dput(dentry);
1689	dentry = ERR_PTR(error);
1690	}
1691	} else {
1692	old = inode->i_op->lookup(inode, dentry, flags);
1693	d_lookup_done(dentry);
1694	if (unlikely(old)) {
1695	dput(dentry);
1696	dentry = old;
1697	}
1698	}
1699	return dentry;
1700	}
1701
1702	static struct dentry lookup_slow(const* struct qstr *name,
1703	struct dentry *dir,
1704	unsigned int flags)
1705	{
1706	struct inode *inode = dir->d_inode;
1707	struct dentry *res;
1708	inode_lock_shared(inode);
1709	res = __lookup_slow(name, dir, flags);
1710	inode_unlock_shared(inode);
1711	return res;
1712	}
1713
1714	static inline int may_lookup(struct mnt_idmap *idmap,
1715	struct nameidata *nd)
1716	{
1717	if (nd->flags & LOOKUP_RCU) {
1718	int err = inode_permission(idmap, nd->inode, MAY_EXEC\|MAY_NOT_BLOCK);
1719	if (!err) // success, keep going
1720	return `0`;
1721	if (!try_to_unlazy(nd))
1722	return -ECHILD; // redo it all non-lazy
1723	if (err != -ECHILD) // hard error
1724	return err;
1725	}
1726	return inode_permission(idmap, nd->inode, MAY_EXEC);
1727	}
1728
1729	static int reserve_stack(struct nameidata nd, struct* path *link)
1730	{
1731	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
1732	return -ELOOP;
1733
1734	if (likely(nd->depth != EMBEDDED_LEVELS))
1735	return `0`;
1736	if (likely(nd->stack != nd->internal))
1737	return `0`;
1738	if (likely(nd_alloc_stack(nd)))
1739	return `0`;
1740
1741	if (nd->flags & LOOKUP_RCU) {
1742	// we need to grab link before we do unlazy. And we can't skip
1743	// unlazy even if we fail to grab the link - cleanup needs it
1744	bool grabbed_link = legitimize_path(nd, path: link, seq: nd->next_seq);
1745
1746	if (!try_to_unlazy(nd) \|\| !grabbed_link)
1747	return -ECHILD;
1748
1749	if (nd_alloc_stack(nd))
1750	return `0`;
1751	}
1752	return -ENOMEM;
1753	}
1754
/* Flag bits passed as @flags to walk_component(), step_into() and pick_link(). */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
1756
1757	static const char pick_link(struct* nameidata nd, struct* path *link,
1758	struct inode inode, int* flags)
1759	{
1760	struct saved *last;
1761	const char *res;
1762	int error = reserve_stack(nd, link);
1763
1764	if (unlikely(error)) {
1765	if (!(nd->flags & LOOKUP_RCU))
1766	path_put(link);
1767	return ERR_PTR(error);
1768	}
1769	last = nd->stack + nd->depth++;
1770	last->link = *link;
1771	clear_delayed_call(call: &last->done);
1772	last->seq = nd->next_seq;
1773
1774	if (flags & WALK_TRAILING) {
1775	error = may_follow_link(nd, inode);
1776	if (unlikely(error))
1777	return ERR_PTR(error);
1778	}
1779
1780	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) \|\|
1781	unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
1782	return ERR_PTR(error: -ELOOP);
1783
1784	if (!(nd->flags & LOOKUP_RCU)) {
1785	touch_atime(&last->link);
1786	cond_resched();
1787	} else if (atime_needs_update(&last->link, inode)) {
1788	if (!try_to_unlazy(nd))
1789	return ERR_PTR(error: -ECHILD);
1790	touch_atime(&last->link);
1791	}
1792
1793	error = security_inode_follow_link(dentry: link->dentry, inode,
1794	rcu: nd->flags & LOOKUP_RCU);
1795	if (unlikely(error))
1796	return ERR_PTR(error);
1797
1798	res = READ_ONCE(inode->i_link);
1799	if (!res) {
1800	const char * (get)(struct* dentry , struct* inode *,
1801	struct delayed_call *);
1802	get = inode->i_op->get_link;
1803	if (nd->flags & LOOKUP_RCU) {
1804	res = get(NULL, inode, &last->done);
1805	if (res == ERR_PTR(error: -ECHILD) && try_to_unlazy(nd))
1806	res = get(link->dentry, inode, &last->done);
1807	} else {
1808	res = get(link->dentry, inode, &last->done);
1809	}
1810	if (!res)
1811	goto all_done;
1812	if (IS_ERR(ptr: res))
1813	return res;
1814	}
1815	if (*res == `'/'`) {
1816	error = nd_jump_root(nd);
1817	if (unlikely(error))
1818	return ERR_PTR(error);
1819	while (unlikely(*++res == `'/'`))
1820	;
1821	}
1822	if (*res)
1823	return res;
1824	all_done: // pure jump
1825	put_link(nd);
1826	return NULL;
1827	}
1828
1829	/*
1830	* Do we need to follow links? We _really_ want to be able
1831	* to do this check without having to look at inode->i_op,
1832	* so we keep a cache of "no, this doesn't need follow_link"
1833	* for the common case.
1834	*
1835	* NOTE: dentry must be what nd->next_seq had been sampled from.
1836	*/
1837	static const char step_into(struct* nameidata nd, int* flags,
1838	struct dentry *dentry)
1839	{
1840	struct path path;
1841	struct inode *inode;
1842	int err = handle_mounts(nd, dentry, path: &path);
1843
1844	if (err < `0`)
1845	return ERR_PTR(error: err);
1846	inode = path.dentry->d_inode;
1847	if (likely(!d_is_symlink(path.dentry)) \|\|
1848	((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) \|\|
1849	(flags & WALK_NOFOLLOW)) {
1850	/ not a symlink or should not follow /
1851	if (nd->flags & LOOKUP_RCU) {
1852	if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
1853	return ERR_PTR(error: -ECHILD);
1854	if (unlikely(!inode))
1855	return ERR_PTR(error: -ENOENT);
1856	} else {
1857	dput(nd->path.dentry);
1858	if (nd->path.mnt != path.mnt)
1859	mntput(mnt: nd->path.mnt);
1860	}
1861	nd->path = path;
1862	nd->inode = inode;
1863	nd->seq = nd->next_seq;
1864	return NULL;
1865	}
1866	if (nd->flags & LOOKUP_RCU) {
1867	/ make sure that d_is_symlink above matches inode /
1868	if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
1869	return ERR_PTR(error: -ECHILD);
1870	} else {
1871	if (path.mnt == nd->path.mnt)
1872	mntget(mnt: path.mnt);
1873	}
1874	return pick_link(nd, link: &path, inode, flags);
1875	}
1876
1877	static struct dentry follow_dotdot_rcu(struct* nameidata *nd)
1878	{
1879	struct dentry parent, old;
1880
1881	if (path_equal(path1: &nd->path, path2: &nd->root))
1882	goto in_root;
1883	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
1884	struct path path;
1885	unsigned seq;
1886	if (!choose_mountpoint_rcu(m: real_mount(mnt: nd->path.mnt),
1887	root: &nd->root, path: &path, seqp: &seq))
1888	goto in_root;
1889	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1890	return ERR_PTR(error: -ECHILD);
1891	nd->path = path;
1892	nd->inode = path.dentry->d_inode;
1893	nd->seq = seq;
1894	// makes sure that non-RCU pathwalk could reach this state
1895	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
1896	return ERR_PTR(error: -ECHILD);
1897	/ we know that mountpoint was pinned /
1898	}
1899	old = nd->path.dentry;
1900	parent = old->d_parent;
1901	nd->next_seq = read_seqcount_begin(&parent->d_seq);
1902	// makes sure that non-RCU pathwalk could reach this state
1903	if (read_seqcount_retry(&old->d_seq, nd->seq))
1904	return ERR_PTR(error: -ECHILD);
1905	if (unlikely(!path_connected(nd->path.mnt, parent)))
1906	return ERR_PTR(error: -ECHILD);
1907	return parent;
1908	in_root:
1909	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
1910	return ERR_PTR(error: -ECHILD);
1911	if (unlikely(nd->flags & LOOKUP_BENEATH))
1912	return ERR_PTR(error: -ECHILD);
1913	nd->next_seq = nd->seq;
1914	return nd->path.dentry;
1915	}
1916
1917	static struct dentry follow_dotdot(struct* nameidata *nd)
1918	{
1919	struct dentry *parent;
1920
1921	if (path_equal(path1: &nd->path, path2: &nd->root))
1922	goto in_root;
1923	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
1924	struct path path;
1925
1926	if (!choose_mountpoint(m: real_mount(mnt: nd->path.mnt),
1927	root: &nd->root, path: &path))
1928	goto in_root;
1929	path_put(&nd->path);
1930	nd->path = path;
1931	nd->inode = path.dentry->d_inode;
1932	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1933	return ERR_PTR(error: -EXDEV);
1934	}
1935	/ rare case of legitimate dget_parent()... /
1936	parent = dget_parent(dentry: nd->path.dentry);
1937	if (unlikely(!path_connected(nd->path.mnt, parent))) {
1938	dput(parent);
1939	return ERR_PTR(error: -ENOENT);
1940	}
1941	return parent;
1942
1943	in_root:
1944	if (unlikely(nd->flags & LOOKUP_BENEATH))
1945	return ERR_PTR(error: -EXDEV);
1946	return dget(dentry: nd->path.dentry);
1947	}
1948
1949	static const char handle_dots(struct* nameidata nd, int* type)
1950	{
1951	if (type == LAST_DOTDOT) {
1952	const char *error = NULL;
1953	struct dentry *parent;
1954
1955	if (!nd->root.mnt) {
1956	error = ERR_PTR(error: set_root(nd));
1957	if (error)
1958	return error;
1959	}
1960	if (nd->flags & LOOKUP_RCU)
1961	parent = follow_dotdot_rcu(nd);
1962	else
1963	parent = follow_dotdot(nd);
1964	if (IS_ERR(ptr: parent))
1965	return ERR_CAST(ptr: parent);
1966	error = step_into(nd, flags: WALK_NOFOLLOW, dentry: parent);
1967	if (unlikely(error))
1968	return error;
1969
1970	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
1971	/*
1972	* If there was a racing rename or mount along our
1973	* path, then we can't be sure that ".." hasn't jumped
1974	* above nd->root (and so userspace should retry or use
1975	* some fallback).
1976	*/
1977	smp_rmb();
1978	if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
1979	return ERR_PTR(error: -EAGAIN);
1980	if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
1981	return ERR_PTR(error: -EAGAIN);
1982	}
1983	}
1984	return NULL;
1985	}
1986
1987	static const char walk_component(struct* nameidata nd, int* flags)
1988	{
1989	struct dentry *dentry;
1990	/*
1991	* "." and ".." are special - ".." especially so because it has
1992	* to be able to know about the current root directory and
1993	* parent relationships.
1994	*/
1995	if (unlikely(nd->last_type != LAST_NORM)) {
1996	if (!(flags & WALK_MORE) && nd->depth)
1997	put_link(nd);
1998	return handle_dots(nd, type: nd->last_type);
1999	}
2000	dentry = lookup_fast(nd);
2001	if (IS_ERR(ptr: dentry))
2002	return ERR_CAST(ptr: dentry);
2003	if (unlikely(!dentry)) {
2004	dentry = lookup_slow(name: &nd->last, dir: nd->path.dentry, flags: nd->flags);
2005	if (IS_ERR(ptr: dentry))
2006	return ERR_CAST(ptr: dentry);
2007	}
2008	if (!(flags & WALK_MORE) && nd->depth)
2009	put_link(nd);
2010	return step_into(nd, flags, dentry);
2011	}
2012
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
2030	#ifdef CONFIG_DCACHE_WORD_ACCESS
2031
2032	#include <asm/word-at-a-time.h>
2033
2034	#ifdef HASH_MIX
2035
/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
2037
2038	#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 *
 * HASH_MIX(x, y, a) folds the word @a into the two 64-bit state
 * variables @x and @y, updating both in place.  @x and @y are evaluated
 * several times and must be plain lvalues.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)
2073
2074	/*
2075	* Fold two longs into one 32-bit hash value. This must be fast, but
2076	* latency isn't quite as critical, as there is a fair bit of additional
2077	* work done before the hash value is used.
2078	*/
2079	static inline unsigned int fold_hash(unsigned long x, unsigned long y)
2080	{
2081	y ^= x * GOLDEN_RATIO_64;
2082	y *= GOLDEN_RATIO_64;
2083	return y >> `32`;
2084	}
2085
2086	#else /* 32-bit case */
2087
/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 *
 * 32-bit counterpart of the 64-bit HASH_MIX(): same structure, with
 * rol32() and the rotate constants (7,20).  @x and @y are modified in
 * place and evaluated more than once; the expression yields the new @y.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)
2103
/*
 * Fold the two state words into one 32-bit hash value (32-bit variant):
 * hash @x, xor the result into @y and hash again.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	return __hash_32(y ^ __hash_32(x));
}
2109
2110	#endif
2111
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 *
 * @salt seeds the second state word, @name is the string and @len its
 * length in bytes.  Whole words are mixed with HASH_MIX(); the trailing
 * partial word (if any) is masked down to the remaining @len bytes and
 * xor-ed into the first state word before folding.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long a, x = 0, y = (unsigned long)salt;

	for (;;) {
		if (!len)
			goto done;
		a = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long))
			break;
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	/* fewer than a word left: keep only the payload bytes of 'a' */
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
2138
2139	/ Return the "hash_len" (hash and length) of a null-terminated string /
2140	u64 hashlen_string(const void salt, const* char *name)
2141	{
2142	unsigned long a = `0`, x = `0`, y = (unsigned long)salt;
2143	unsigned long adata, mask, len;
2144	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2145
2146	len = `0`;
2147	goto inside;
2148
2149	do {
2150	HASH_MIX(x, y, a);
2151	len += sizeof(unsigned long);
2152	inside:
2153	a = load_unaligned_zeropad(addr: name+len);
2154	} while (!has_zero(a, bits: &adata, c: &constants));
2155
2156	adata = prep_zero_mask(a, bits: adata, c: &constants);
2157	mask = create_zero_mask(bits: adata);
2158	x ^= a & zero_bytemask(mask);
2159
2160	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2161	}
2162	EXPORT_SYMBOL(hashlen_string);
2163
2164	/*
2165	* Calculate the length and hash of the path component, and
2166	* return the "hash_len" as the result.
2167	*/
2168	static inline u64 hash_name(const void salt, const* char *name)
2169	{
2170	unsigned long a = `0`, b, x = `0`, y = (unsigned long)salt;
2171	unsigned long adata, bdata, mask, len;
2172	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2173
2174	len = `0`;
2175	goto inside;
2176
2177	do {
2178	HASH_MIX(x, y, a);
2179	len += sizeof(unsigned long);
2180	inside:
2181	a = load_unaligned_zeropad(addr: name+len);
2182	b = a ^ REPEAT_BYTE(`'/'`);
2183	} while (!(has_zero(a, bits: &adata, c: &constants) \| has_zero(a: b, bits: &bdata, c: &constants)));
2184
2185	adata = prep_zero_mask(a, bits: adata, c: &constants);
2186	bdata = prep_zero_mask(a: b, bits: bdata, c: &constants);
2187	mask = create_zero_mask(bits: adata \| bdata);
2188	x ^= a & zero_bytemask(mask);
2189
2190	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2191	}
2192
2193	#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2194
/*
 * Return the hash of a string of known length.
 *
 * Byte-at-a-time version: feeds each of the @len bytes of @name through
 * partial_name_hash(), starting from init_name_hash(@salt).
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	while (len--)
		hash = partial_name_hash((unsigned char)*name++, hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
2204
2205	/ Return the "hash_len" (hash and length) of a null-terminated string /
2206	u64 hashlen_string(const void salt, const* char *name)
2207	{
2208	unsigned long hash = init_name_hash(salt);
2209	unsigned long len = `0`, c;
2210
2211	c = (unsigned char)*name;
2212	while (c) {
2213	len++;
2214	hash = partial_name_hash(c, hash);
2215	c = (unsigned char)name[len];
2216	}
2217	return hashlen_create(end_name_hash(hash), len);
2218	}
2219	EXPORT_SYMBOL(hashlen_string);
2220
2221	/*
2222	* We know there's a real path component here of at least
2223	* one character.
2224	*/
2225	static inline u64 hash_name(const void salt, const* char *name)
2226	{
2227	unsigned long hash = init_name_hash(salt);
2228	unsigned long len = `0`, c;
2229
2230	c = (unsigned char)*name;
2231	do {
2232	len++;
2233	hash = partial_name_hash(c, hash);
2234	c = (unsigned char)name[len];
2235	} while (c && c != `'/'`);
2236	return hashlen_create(end_name_hash(hash), len);
2237	}
2238
2239	#endif
2240
2241	/*
2242	* Name resolution.
2243	* This is the basic name resolution function, turning a pathname into
2244	* the final dentry. We expect 'base' to be positive and a directory.
2245	*
2246	* Returns 0 and nd will have valid dentry and mnt on success.
2247	* Returns error and drops reference to input namei data on failure.
2248	*/
2249	static int link_path_walk(const char name, struct* nameidata *nd)
2250	{
2251	int depth = `0`; // depth <= nd->depth
2252	int err;
2253
2254	nd->last_type = LAST_ROOT;
2255	nd->flags \|= LOOKUP_PARENT;
2256	if (IS_ERR(ptr: name))
2257	return PTR_ERR(ptr: name);
2258	while (*name==`'/'`)
2259	name++;
2260	if (!*name) {
2261	nd->dir_mode = `0`; // short-circuit the 'hardening' idiocy
2262	return `0`;
2263	}
2264
2265	/ At this point we know we have a real path component. /
2266	for(;;) {
2267	struct mnt_idmap *idmap;
2268	const char *link;
2269	u64 hash_len;
2270	int type;
2271
2272	idmap = mnt_idmap(mnt: nd->path.mnt);
2273	err = may_lookup(idmap, nd);
2274	if (err)
2275	return err;
2276
2277	hash_len = hash_name(salt: nd->path.dentry, name);
2278
2279	type = LAST_NORM;
2280	if (name[`0`] == `'.'`) switch (hashlen_len(hash_len)) {
2281	case `2`:
2282	if (name[`1`] == `'.'`) {
2283	type = LAST_DOTDOT;
2284	nd->state \|= ND_JUMPED;
2285	}
2286	break;
2287	case `1`:
2288	type = LAST_DOT;
2289	}
2290	if (likely(type == LAST_NORM)) {
2291	struct dentry *parent = nd->path.dentry;
2292	nd->state &= ~ND_JUMPED;
2293	if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2294	struct qstr this = { { .hash_len = hash_len }, .name = name };
2295	err = parent->d_op->d_hash(parent, &this);
2296	if (err < `0`)
2297	return err;
2298	hash_len = this.hash_len;
2299	name = this.name;
2300	}
2301	}
2302
2303	nd->last.hash_len = hash_len;
2304	nd->last.name = name;
2305	nd->last_type = type;
2306
2307	name += hashlen_len(hash_len);
2308	if (!*name)
2309	goto OK;
2310	/*
2311	* If it wasn't NUL, we know it was '/'. Skip that
2312	* slash, and continue until no more slashes.
2313	*/
2314	do {
2315	name++;
2316	} while (unlikely(*name == `'/'`));
2317	if (unlikely(!*name)) {
2318	OK:
2319	/ pathname or trailing symlink, done /
2320	if (!depth) {
2321	nd->dir_vfsuid = i_uid_into_vfsuid(idmap, inode: nd->inode);
2322	nd->dir_mode = nd->inode->i_mode;
2323	nd->flags &= ~LOOKUP_PARENT;
2324	return `0`;
2325	}
2326	/ last component of nested symlink /
2327	name = nd->stack[--depth].name;
2328	link = walk_component(nd, flags: `0`);
2329	} else {
2330	/ not the last component /
2331	link = walk_component(nd, flags: WALK_MORE);
2332	}
2333	if (unlikely(link)) {
2334	if (IS_ERR(ptr: link))
2335	return PTR_ERR(ptr: link);
2336	/ a symlink to follow /
2337	nd->stack[depth++].name = name;
2338	name = link;
2339	continue;
2340	}
2341	if (unlikely(!d_can_lookup(nd->path.dentry))) {
2342	if (nd->flags & LOOKUP_RCU) {
2343	if (!try_to_unlazy(nd))
2344	return -ECHILD;
2345	}
2346	return -ENOTDIR;
2347	}
2348	}
2349	}
2350
2351	/ must be paired with terminate_walk() /
2352	static const char path_init(struct* nameidata nd, unsigned* flags)
2353	{
2354	int error;
2355	const char *s = nd->name->name;
2356
2357	/ LOOKUP_CACHED requires RCU, ask caller to retry /
2358	if ((flags & (LOOKUP_RCU \| LOOKUP_CACHED)) == LOOKUP_CACHED)
2359	return ERR_PTR(error: -EAGAIN);
2360
2361	if (!*s)
2362	flags &= ~LOOKUP_RCU;
2363	if (flags & LOOKUP_RCU)
2364	rcu_read_lock();
2365	else
2366	nd->seq = nd->next_seq = `0`;
2367
2368	nd->flags = flags;
2369	nd->state \|= ND_JUMPED;
2370
2371	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
2372	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
2373	smp_rmb();
2374
2375	if (nd->state & ND_ROOT_PRESET) {
2376	struct dentry *root = nd->root.dentry;
2377	struct inode *inode = root->d_inode;
2378	if (*s && unlikely(!d_can_lookup(root)))
2379	return ERR_PTR(error: -ENOTDIR);
2380	nd->path = nd->root;
2381	nd->inode = inode;
2382	if (flags & LOOKUP_RCU) {
2383	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2384	nd->root_seq = nd->seq;
2385	} else {
2386	path_get(&nd->path);
2387	}
2388	return s;
2389	}
2390
2391	nd->root.mnt = NULL;
2392
2393	/ Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). /
2394	if (*s == `'/'` && !(flags & LOOKUP_IN_ROOT)) {
2395	error = nd_jump_root(nd);
2396	if (unlikely(error))
2397	return ERR_PTR(error);
2398	return s;
2399	}
2400
2401	/ Relative pathname -- get the starting-point it is relative to. /
2402	if (nd->dfd == AT_FDCWD) {
2403	if (flags & LOOKUP_RCU) {
2404	struct fs_struct *fs = current->fs;
2405	unsigned seq;
2406
2407	do {
2408	seq = read_seqcount_begin(&fs->seq);
2409	nd->path = fs->pwd;
2410	nd->inode = nd->path.dentry->d_inode;
2411	nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2412	} while (read_seqcount_retry(&fs->seq, seq));
2413	} else {
2414	get_fs_pwd(current->fs, pwd: &nd->path);
2415	nd->inode = nd->path.dentry->d_inode;
2416	}
2417	} else {
2418	/ Caller must check execute permissions on the starting path component /
2419	struct fd f = fdget_raw(fd: nd->dfd);
2420	struct dentry *dentry;
2421
2422	if (!f.file)
2423	return ERR_PTR(error: -EBADF);
2424
2425	dentry = f.file->f_path.dentry;
2426
2427	if (*s && unlikely(!d_can_lookup(dentry))) {
2428	fdput(fd: f);
2429	return ERR_PTR(error: -ENOTDIR);
2430	}
2431
2432	nd->path = f.file->f_path;
2433	if (flags & LOOKUP_RCU) {
2434	nd->inode = nd->path.dentry->d_inode;
2435	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2436	} else {
2437	path_get(&nd->path);
2438	nd->inode = nd->path.dentry->d_inode;
2439	}
2440	fdput(fd: f);
2441	}
2442
2443	/ For scoped-lookups we need to set the root to the dirfd as well. /
2444	if (flags & LOOKUP_IS_SCOPED) {
2445	nd->root = nd->path;
2446	if (flags & LOOKUP_RCU) {
2447	nd->root_seq = nd->seq;
2448	} else {
2449	path_get(&nd->root);
2450	nd->state \|= ND_ROOT_GRABBED;
2451	}
2452	}
2453	return s;
2454	}
2455
2456	static inline const char lookup_last(struct* nameidata *nd)
2457	{
2458	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
2459	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
2460
2461	return walk_component(nd, flags: WALK_TRAILING);
2462	}
2463
2464	static int handle_lookup_down(struct nameidata *nd)
2465	{
2466	if (!(nd->flags & LOOKUP_RCU))
2467	dget(dentry: nd->path.dentry);
2468	nd->next_seq = nd->seq;
2469	return PTR_ERR(ptr: step_into(nd, flags: WALK_NOFOLLOW, dentry: nd->path.dentry));
2470	}
2471
2472	/ Returns 0 and nd will be valid on success; Returns error, otherwise. /
2473	static int path_lookupat(struct nameidata nd, unsigned* flags, struct path *path)
2474	{
2475	const char *s = path_init(nd, flags);
2476	int err;
2477
2478	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(ptr: s)) {
2479	err = handle_lookup_down(nd);
2480	if (unlikely(err < `0`))
2481	s = ERR_PTR(error: err);
2482	}
2483
2484	while (!(err = link_path_walk(name: s, nd)) &&
2485	(s = lookup_last(nd)) != NULL)
2486	;
2487	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
2488	err = handle_lookup_down(nd);
2489	nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
2490	}
2491	if (!err)
2492	err = complete_walk(nd);
2493
2494	if (!err && nd->flags & LOOKUP_DIRECTORY)
2495	if (!d_can_lookup(dentry: nd->path.dentry))
2496	err = -ENOTDIR;
2497	if (!err) {
2498	*path = nd->path;
2499	nd->path.mnt = NULL;
2500	nd->path.dentry = NULL;
2501	}
2502	terminate_walk(nd);
2503	return err;
2504	}
2505
2506	int filename_lookup(int dfd, struct filename name, unsigned* flags,
2507	struct path path, struct* path *root)
2508	{
2509	int retval;
2510	struct nameidata nd;
2511	if (IS_ERR(ptr: name))
2512	return PTR_ERR(ptr: name);
2513	set_nameidata(p: &nd, dfd, name, root);
2514	retval = path_lookupat(nd: &nd, flags: flags \| LOOKUP_RCU, path);
2515	if (unlikely(retval == -ECHILD))
2516	retval = path_lookupat(nd: &nd, flags, path);
2517	if (unlikely(retval == -ESTALE))
2518	retval = path_lookupat(nd: &nd, flags: flags \| LOOKUP_REVAL, path);
2519
2520	if (likely(!retval))
2521	audit_inode(name, dentry: path->dentry,
2522	aflags: flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : `0`);
2523	restore_nameidata();
2524	return retval;
2525	}
2526
2527	/ Returns 0 and nd will be valid on success; Returns error, otherwise. /
2528	static int path_parentat(struct nameidata nd, unsigned* flags,
2529	struct path *parent)
2530	{
2531	const char *s = path_init(nd, flags);
2532	int err = link_path_walk(name: s, nd);
2533	if (!err)
2534	err = complete_walk(nd);
2535	if (!err) {
2536	*parent = nd->path;
2537	nd->path.mnt = NULL;
2538	nd->path.dentry = NULL;
2539	}
2540	terminate_walk(nd);
2541	return err;
2542	}
2543
2544	/ Note: this does not consume "name" /
2545	static int __filename_parentat(int dfd, struct filename *name,
2546	unsigned int flags, struct path *parent,
2547	struct qstr last, int* *type,
2548	const struct path *root)
2549	{
2550	int retval;
2551	struct nameidata nd;
2552
2553	if (IS_ERR(ptr: name))
2554	return PTR_ERR(ptr: name);
2555	set_nameidata(p: &nd, dfd, name, root);
2556	retval = path_parentat(nd: &nd, flags: flags \| LOOKUP_RCU, parent);
2557	if (unlikely(retval == -ECHILD))
2558	retval = path_parentat(nd: &nd, flags, parent);
2559	if (unlikely(retval == -ESTALE))
2560	retval = path_parentat(nd: &nd, flags: flags \| LOOKUP_REVAL, parent);
2561	if (likely(!retval)) {
2562	*last = nd.last;
2563	*type = nd.last_type;
2564	audit_inode(name, dentry: parent->dentry, AUDIT_INODE_PARENT);
2565	}
2566	restore_nameidata();
2567	return retval;
2568	}
2569
2570	static int filename_parentat(int dfd, struct filename *name,
2571	unsigned int flags, struct path *parent,
2572	struct qstr last, int* *type)
2573	{
2574	return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
2575	}
2576
2577	/ does lookup, returns the object with parent locked /
2578	static struct dentry __kern_path_locked(int* dfd, struct filename name, struct* path *path)
2579	{
2580	struct dentry *d;
2581	struct qstr last;
2582	int type, error;
2583
2584	error = filename_parentat(dfd, name, flags: `0`, parent: path, last: &last, type: &type);
2585	if (error)
2586	return ERR_PTR(error);
2587	if (unlikely(type != LAST_NORM)) {
2588	path_put(path);
2589	return ERR_PTR(error: -EINVAL);
2590	}
2591	inode_lock_nested(inode: path->dentry->d_inode, subclass: I_MUTEX_PARENT);
2592	d = lookup_one_qstr_excl(&last, path->dentry, `0`);
2593	if (IS_ERR(ptr: d)) {
2594	inode_unlock(inode: path->dentry->d_inode);
2595	path_put(path);
2596	}
2597	return d;
2598	}
2599
2600	struct dentry kern_path_locked(const* char name, struct* path *path)
2601	{
2602	struct filename *filename = getname_kernel(name);
2603	struct dentry *res = __kern_path_locked(AT_FDCWD, name: filename, path);
2604
2605	putname(filename);
2606	return res;
2607	}
2608
2609	struct dentry user_path_locked_at(int* dfd, const char __user name, struct* path *path)
2610	{
2611	struct filename *filename = getname(filename: name);
2612	struct dentry *res = __kern_path_locked(dfd, name: filename, path);
2613
2614	putname(filename);
2615	return res;
2616	}
2617	EXPORT_SYMBOL(user_path_locked_at);
2618
2619	int kern_path(const char name, unsigned* int flags, struct path *path)
2620	{
2621	struct filename *filename = getname_kernel(name);
2622	int ret = filename_lookup(AT_FDCWD, name: filename, flags, path, NULL);
2623
2624	putname(filename);
2625	return ret;
2626
2627	}
2628	EXPORT_SYMBOL(kern_path);
2629
2630	/**
2631	* vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
2632	* @filename: filename structure
2633	* @flags: lookup flags
2634	* @parent: pointer to struct path to fill
2635	* @last: last component
2636	* @type: type of the last component
2637	* @root: pointer to struct path of the base directory
2638	*/
2639	int vfs_path_parent_lookup(struct filename filename, unsigned* int flags,
2640	struct path parent, struct* qstr last, int* *type,
2641	const struct path *root)
2642	{
2643	return __filename_parentat(AT_FDCWD, name: filename, flags, parent, last,
2644	type, root);
2645	}
2646	EXPORT_SYMBOL(vfs_path_parent_lookup);
2647
2648	/**
2649	* vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
2650	* @dentry: pointer to dentry of the base directory
2651	* @mnt: pointer to vfs mount of the base directory
2652	* @name: pointer to file name
2653	* @flags: lookup flags
2654	* @path: pointer to struct path to fill
2655	*/
2656	int vfs_path_lookup(struct dentry dentry, struct* vfsmount *mnt,
2657	const char name, unsigned* int flags,
2658	struct path *path)
2659	{
2660	struct filename *filename;
2661	struct path root = {.mnt = mnt, .dentry = dentry};
2662	int ret;
2663
2664	filename = getname_kernel(name);
2665	/ the first argument of filename_lookup() is ignored with root /
2666	ret = filename_lookup(AT_FDCWD, name: filename, flags, path, root: &root);
2667	putname(filename);
2668	return ret;
2669	}
2670	EXPORT_SYMBOL(vfs_path_lookup);
2671
2672	static int lookup_one_common(struct mnt_idmap *idmap,
2673	const char name, struct* dentry base, int* len,
2674	struct qstr *this)
2675	{
2676	this->name = name;
2677	this->len = len;
2678	this->hash = full_name_hash(base, name, len);
2679	if (!len)
2680	return -EACCES;
2681
2682	if (is_dot_dotdot(name, len))
2683	return -EACCES;
2684
2685	while (len--) {
2686	unsigned int c = (const* unsigned char *)name++;
2687	if (c == `'/'` \|\| c == `'\0'`)
2688	return -EACCES;
2689	}
2690	/*
2691	* See if the low-level filesystem might want
2692	* to use its own hash..
2693	*/
2694	if (base->d_flags & DCACHE_OP_HASH) {
2695	int err = base->d_op->d_hash(base, this);
2696	if (err < `0`)
2697	return err;
2698	}
2699
2700	return inode_permission(idmap, base->d_inode, MAY_EXEC);
2701	}
2702
2703	/**
2704	* try_lookup_one_len - filesystem helper to lookup single pathname component
2705	* @name: pathname component to lookup
2706	* @base: base directory to lookup from
2707	* @len: maximum length @len should be interpreted to
2708	*
2709	* Look up a dentry by name in the dcache, returning NULL if it does not
2710	* currently exist. The function does not try to create a dentry.
2711	*
2712	* Note that this routine is purely a helper for filesystem usage and should
2713	* not be called by generic code.
2714	*
2715	* The caller must hold base->i_mutex.
2716	*/
2717	struct dentry try_lookup_one_len(const* char name, struct* dentry base, int* len)
2718	{
2719	struct qstr this;
2720	int err;
2721
2722	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2723
2724	err = lookup_one_common(idmap: &nop_mnt_idmap, name, base, len, this: &this);
2725	if (err)
2726	return ERR_PTR(error: err);
2727
2728	return lookup_dcache(name: &this, dir: base, flags: `0`);
2729	}
2730	EXPORT_SYMBOL(try_lookup_one_len);
2731
2732	/**
2733	* lookup_one_len - filesystem helper to lookup single pathname component
2734	* @name: pathname component to lookup
2735	* @base: base directory to lookup from
2736	* @len: maximum length @len should be interpreted to
2737	*
2738	* Note that this routine is purely a helper for filesystem usage and should
2739	* not be called by generic code.
2740	*
2741	* The caller must hold base->i_mutex.
2742	*/
2743	struct dentry lookup_one_len(const* char name, struct* dentry base, int* len)
2744	{
2745	struct dentry *dentry;
2746	struct qstr this;
2747	int err;
2748
2749	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2750
2751	err = lookup_one_common(idmap: &nop_mnt_idmap, name, base, len, this: &this);
2752	if (err)
2753	return ERR_PTR(error: err);
2754
2755	dentry = lookup_dcache(name: &this, dir: base, flags: `0`);
2756	return dentry ? dentry : __lookup_slow(name: &this, dir: base, flags: `0`);
2757	}
2758	EXPORT_SYMBOL(lookup_one_len);
2759
2760	/**
2761	* lookup_one - filesystem helper to lookup single pathname component
2762	* @idmap: idmap of the mount the lookup is performed from
2763	* @name: pathname component to lookup
2764	* @base: base directory to lookup from
2765	* @len: maximum length @len should be interpreted to
2766	*
2767	* Note that this routine is purely a helper for filesystem usage and should
2768	* not be called by generic code.
2769	*
2770	* The caller must hold base->i_mutex.
2771	*/
2772	struct dentry lookup_one(struct* mnt_idmap idmap, const* char *name,
2773	struct dentry base, int* len)
2774	{
2775	struct dentry *dentry;
2776	struct qstr this;
2777	int err;
2778
2779	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2780
2781	err = lookup_one_common(idmap, name, base, len, this: &this);
2782	if (err)
2783	return ERR_PTR(error: err);
2784
2785	dentry = lookup_dcache(name: &this, dir: base, flags: `0`);
2786	return dentry ? dentry : __lookup_slow(name: &this, dir: base, flags: `0`);
2787	}
2788	EXPORT_SYMBOL(lookup_one);
2789
2790	/**
2791	* lookup_one_unlocked - filesystem helper to lookup single pathname component
2792	* @idmap: idmap of the mount the lookup is performed from
2793	* @name: pathname component to lookup
2794	* @base: base directory to lookup from
2795	* @len: maximum length @len should be interpreted to
2796	*
2797	* Note that this routine is purely a helper for filesystem usage and should
2798	* not be called by generic code.
2799	*
2800	* Unlike lookup_one_len, it should be called without the parent
2801	* i_mutex held, and will take the i_mutex itself if necessary.
2802	*/
2803	struct dentry lookup_one_unlocked(struct* mnt_idmap *idmap,
2804	const char name, struct* dentry *base,
2805	int len)
2806	{
2807	struct qstr this;
2808	int err;
2809	struct dentry *ret;
2810
2811	err = lookup_one_common(idmap, name, base, len, this: &this);
2812	if (err)
2813	return ERR_PTR(error: err);
2814
2815	ret = lookup_dcache(name: &this, dir: base, flags: `0`);
2816	if (!ret)
2817	ret = lookup_slow(name: &this, dir: base, flags: `0`);
2818	return ret;
2819	}
2820	EXPORT_SYMBOL(lookup_one_unlocked);
2821
2822	/**
2823	* lookup_one_positive_unlocked - filesystem helper to lookup single
2824	* pathname component
2825	* @idmap: idmap of the mount the lookup is performed from
2826	* @name: pathname component to lookup
2827	* @base: base directory to lookup from
2828	* @len: maximum length @len should be interpreted to
2829	*
2830	* This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
2831	* known positive or ERR_PTR(). This is what most of the users want.
2832	*
2833	* Note that pinned negative with unlocked parent _can_ become positive at any
2834	* time, so callers of lookup_one_unlocked() need to be very careful; pinned
2835	* positives have >d_inode stable, so this one avoids such problems.
2836	*
2837	* Note that this routine is purely a helper for filesystem usage and should
2838	* not be called by generic code.
2839	*
2840	* The helper should be called without i_mutex held.
2841	*/
2842	struct dentry lookup_one_positive_unlocked(struct* mnt_idmap *idmap,
2843	const char *name,
2844	struct dentry base, int* len)
2845	{
2846	struct dentry *ret = lookup_one_unlocked(idmap, name, base, len);
2847
2848	if (!IS_ERR(ptr: ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
2849	dput(ret);
2850	ret = ERR_PTR(error: -ENOENT);
2851	}
2852	return ret;
2853	}
2854	EXPORT_SYMBOL(lookup_one_positive_unlocked);
2855
2856	/**
2857	* lookup_one_len_unlocked - filesystem helper to lookup single pathname component
2858	* @name: pathname component to lookup
2859	* @base: base directory to lookup from
2860	* @len: maximum length @len should be interpreted to
2861	*
2862	* Note that this routine is purely a helper for filesystem usage and should
2863	* not be called by generic code.
2864	*
2865	* Unlike lookup_one_len, it should be called without the parent
2866	* i_mutex held, and will take the i_mutex itself if necessary.
2867	*/
2868	struct dentry lookup_one_len_unlocked(const* char *name,
2869	struct dentry base, int* len)
2870	{
2871	return lookup_one_unlocked(&nop_mnt_idmap, name, base, len);
2872	}
2873	EXPORT_SYMBOL(lookup_one_len_unlocked);
2874
2875	/*
2876	* Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
2877	* on negatives. Returns known positive or ERR_PTR(); that's what
2878	* most of the users want. Note that pinned negative with unlocked parent
2879	* _can_ become positive at any time, so callers of lookup_one_len_unlocked()
2880	* need to be very careful; pinned positives have ->d_inode stable, so
2881	* this one avoids such problems.
2882	*/
2883	struct dentry lookup_positive_unlocked(const* char *name,
2884	struct dentry base, int* len)
2885	{
2886	return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len);
2887	}
2888	EXPORT_SYMBOL(lookup_positive_unlocked);
2889
2890	#ifdef CONFIG_UNIX98_PTYS
2891	int path_pts(struct path *path)
2892	{
2893	/ Find something mounted on "pts" in the same directory as*
2894	* the input path.
2895	*/
2896	struct dentry *parent = dget_parent(dentry: path->dentry);
2897	struct dentry *child;
2898	struct qstr this = QSTR_INIT("pts", `3`);
2899
2900	if (unlikely(!path_connected(path->mnt, parent))) {
2901	dput(parent);
2902	return -ENOENT;
2903	}
2904	dput(path->dentry);
2905	path->dentry = parent;
2906	child = d_hash_and_lookup(parent, &this);
2907	if (IS_ERR_OR_NULL(ptr: child))
2908	return -ENOENT;
2909
2910	path->dentry = child;
2911	dput(parent);
2912	follow_down(path, `0`);
2913	return `0`;
2914	}
2915	#endif
2916
2917	int user_path_at_empty(int dfd, const char __user name, unsigned* flags,
2918	struct path path, int* *empty)
2919	{
2920	struct filename *filename = getname_flags(filename: name, flags, empty);
2921	int ret = filename_lookup(dfd, name: filename, flags, path, NULL);
2922
2923	putname(filename);
2924	return ret;
2925	}
2926	EXPORT_SYMBOL(user_path_at_empty);
2927
2928	int __check_sticky(struct mnt_idmap idmap, struct* inode *dir,
2929	struct inode *inode)
2930	{
2931	kuid_t fsuid = current_fsuid();
2932
2933	if (vfsuid_eq_kuid(vfsuid: i_uid_into_vfsuid(idmap, inode), kuid: fsuid))
2934	return `0`;
2935	if (vfsuid_eq_kuid(vfsuid: i_uid_into_vfsuid(idmap, inode: dir), kuid: fsuid))
2936	return `0`;
2937	return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER);
2938	}
2939	EXPORT_SYMBOL(__check_sticky);
2940
2941	/*
2942	* Check whether we can remove a link victim from directory dir, check
2943	* whether the type of victim is right.
2944	* 1. We can't do it if dir is read-only (done in permission())
2945	* 2. We should have write and exec permissions on dir
2946	* 3. We can't remove anything from append-only dir
2947	* 4. We can't do anything with immutable dir (done in permission())
2948	* 5. If the sticky bit on dir is set we should either
2949	* a. be owner of dir, or
2950	* b. be owner of victim, or
2951	* c. have CAP_FOWNER capability
2952	* 6. If the victim is append-only or immutable we can't do antyhing with
2953	* links pointing to it.
2954	* 7. If the victim has an unknown uid or gid we can't change the inode.
2955	* 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
2956	* 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
2957	* 10. We can't remove a root or mountpoint.
2958	* 11. We don't allow removal of NFS sillyrenamed files; it's handled by
2959	* nfs_async_unlink().
2960	*/
2961	static int may_delete(struct mnt_idmap idmap, struct* inode *dir,
2962	struct dentry *victim, bool isdir)
2963	{
2964	struct inode *inode = d_backing_inode(upper: victim);
2965	int error;
2966
2967	if (d_is_negative(dentry: victim))
2968	return -ENOENT;
2969	BUG_ON(!inode);
2970
2971	BUG_ON(victim->d_parent->d_inode != dir);
2972
2973	/ Inode writeback is not safe when the uid or gid are invalid. /
2974	if (!vfsuid_valid(uid: i_uid_into_vfsuid(idmap, inode)) \|\|
2975	!vfsgid_valid(gid: i_gid_into_vfsgid(idmap, inode)))
2976	return -EOVERFLOW;
2977
2978	audit_inode_child(parent: dir, dentry: victim, AUDIT_TYPE_CHILD_DELETE);
2979
2980	error = inode_permission(idmap, dir, MAY_WRITE \| MAY_EXEC);
2981	if (error)
2982	return error;
2983	if (IS_APPEND(dir))
2984	return -EPERM;
2985
2986	if (check_sticky(idmap, dir, inode) \|\| IS_APPEND(inode) \|\|
2987	IS_IMMUTABLE(inode) \|\| IS_SWAPFILE(inode) \|\|
2988	HAS_UNMAPPED_ID(idmap, inode))
2989	return -EPERM;
2990	if (isdir) {
2991	if (!d_is_dir(dentry: victim))
2992	return -ENOTDIR;
2993	if (IS_ROOT(victim))
2994	return -EBUSY;
2995	} else if (d_is_dir(dentry: victim))
2996	return -EISDIR;
2997	if (IS_DEADDIR(dir))
2998	return -ENOENT;
2999	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
3000	return -EBUSY;
3001	return `0`;
3002	}
3003
3004	/ Check whether we can create an object with dentry child in directory*
3005	* dir.
3006	* 1. We can't do it if child already exists (open has special treatment for
3007	* this case, but since we are inlined it's OK)
3008	* 2. We can't do it if dir is read-only (done in permission())
3009	* 3. We can't do it if the fs can't represent the fsuid or fsgid.
3010	* 4. We should have write and exec permissions on dir
3011	* 5. We can't do it if dir is immutable (done in permission())
3012	*/
3013	static inline int may_create(struct mnt_idmap *idmap,
3014	struct inode dir, struct* dentry *child)
3015	{
3016	audit_inode_child(parent: dir, dentry: child, AUDIT_TYPE_CHILD_CREATE);
3017	if (child->d_inode)
3018	return -EEXIST;
3019	if (IS_DEADDIR(dir))
3020	return -ENOENT;
3021	if (!fsuidgid_has_mapping(sb: dir->i_sb, idmap))
3022	return -EOVERFLOW;
3023
3024	return inode_permission(idmap, dir, MAY_WRITE \| MAY_EXEC);
3025	}
3026
3027	// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
3028	static struct dentry lock_two_directories(struct* dentry p1, struct* dentry *p2)
3029	{
3030	struct dentry p = p1, q = p2, *r;
3031
3032	while ((r = p->d_parent) != p2 && r != p)
3033	p = r;
3034	if (r == p2) {
3035	// p is a child of p2 and an ancestor of p1 or p1 itself
3036	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT);
3037	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT2);
3038	return p;
3039	}
3040	// p is the root of connected component that contains p1
3041	// p2 does not occur on the path from p to p1
3042	while ((r = q->d_parent) != p1 && r != p && r != q)
3043	q = r;
3044	if (r == p1) {
3045	// q is a child of p1 and an ancestor of p2 or p2 itself
3046	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT);
3047	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT2);
3048	return q;
3049	} else if (likely(r == p)) {
3050	// both p2 and p1 are descendents of p
3051	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT);
3052	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT2);
3053	return NULL;
3054	} else { // no common ancestor at the time we'd been called
3055	mutex_unlock(lock: &p1->d_sb->s_vfs_rename_mutex);
3056	return ERR_PTR(error: -EXDEV);
3057	}
3058	}
3059
3060	/*
3061	* p1 and p2 should be directories on the same fs.
3062	*/
3063	struct dentry lock_rename(struct* dentry p1, struct* dentry *p2)
3064	{
3065	if (p1 == p2) {
3066	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT);
3067	return NULL;
3068	}
3069
3070	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
3071	return lock_two_directories(p1, p2);
3072	}
3073	EXPORT_SYMBOL(lock_rename);
3074
3075	/*
3076	* c1 and p2 should be on the same fs.
3077	*/
3078	struct dentry lock_rename_child(struct* dentry c1, struct* dentry *p2)
3079	{
3080	if (READ_ONCE(c1->d_parent) == p2) {
3081	/*
3082	* hopefully won't need to touch ->s_vfs_rename_mutex at all.
3083	*/
3084	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT);
3085	/*
3086	* now that p2 is locked, nobody can move in or out of it,
3087	* so the test below is safe.
3088	*/
3089	if (likely(c1->d_parent == p2))
3090	return NULL;
3091
3092	/*
3093	* c1 got moved out of p2 while we'd been taking locks;
3094	* unlock and fall back to slow case.
3095	*/
3096	inode_unlock(inode: p2->d_inode);
3097	}
3098
3099	mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
3100	/*
3101	* nobody can move out of any directories on this fs.
3102	*/
3103	if (likely(c1->d_parent != p2))
3104	return lock_two_directories(p1: c1->d_parent, p2);
3105
3106	/*
3107	* c1 got moved into p2 while we were taking locks;
3108	* we need p2 locked and ->s_vfs_rename_mutex unlocked,
3109	* for consistency with lock_rename().
3110	*/
3111	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT);
3112	mutex_unlock(lock: &c1->d_sb->s_vfs_rename_mutex);
3113	return NULL;
3114	}
3115	EXPORT_SYMBOL(lock_rename_child);
3116
3117	void unlock_rename(struct dentry p1, struct* dentry *p2)
3118	{
3119	inode_unlock(inode: p1->d_inode);
3120	if (p1 != p2) {
3121	inode_unlock(inode: p2->d_inode);
3122	mutex_unlock(lock: &p1->d_sb->s_vfs_rename_mutex);
3123	}
3124	}
3125	EXPORT_SYMBOL(unlock_rename);
3126
3127	/**
3128	* vfs_prepare_mode - prepare the mode to be used for a new inode
3129	* @idmap: idmap of the mount the inode was found from
3130	* @dir: parent directory of the new inode
3131	* @mode: mode of the new inode
3132	* @mask_perms: allowed permission by the vfs
3133	* @type: type of file to be created
3134	*
3135	* This helper consolidates and enforces vfs restrictions on the @mode of a new
3136	* object to be created.
3137	*
3138	* Umask stripping depends on whether the filesystem supports POSIX ACLs (see
3139	* the kernel documentation for mode_strip_umask()). Moving umask stripping
3140	* after setgid stripping allows the same ordering for both non-POSIX ACL and
3141	* POSIX ACL supporting filesystems.
3142	*
3143	* Note that it's currently valid for @type to be 0 if a directory is created.
3144	* Filesystems raise that flag individually and we need to check whether each
3145	* filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
3146	* non-zero type.
3147	*
3148	* Returns: mode to be passed to the filesystem
3149	*/
3150	static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
3151	const struct inode *dir, umode_t mode,
3152	umode_t mask_perms, umode_t type)
3153	{
3154	mode = mode_strip_sgid(idmap, dir, mode);
3155	mode = mode_strip_umask(dir, mode);
3156
3157	/*
3158	* Apply the vfs mandated allowed permission mask and set the type of
3159	* file to be created before we call into the filesystem.
3160	*/
3161	mode &= (mask_perms & ~S_IFMT);
3162	mode \|= (type & S_IFMT);
3163
3164	return mode;
3165	}
3166
3167	/**
3168	* vfs_create - create new file
3169	* @idmap: idmap of the mount the inode was found from
3170	* @dir: inode of @dentry
3171	* @dentry: pointer to dentry of the base directory
3172	* @mode: mode of the new file
3173	* @want_excl: whether the file must not yet exist
3174	*
3175	* Create a new file.
3176	*
3177	* If the inode has been found through an idmapped mount the idmap of
3178	* the vfsmount must be passed through @idmap. This function will then take
3179	* care to map the inode according to @idmap before checking permissions.
3180	* On non-idmapped mounts or if permission checking is to be performed on the
3181	* raw inode simply pass @nop_mnt_idmap.
3182	*/
3183	int vfs_create(struct mnt_idmap idmap, struct* inode *dir,
3184	struct dentry *dentry, umode_t mode, bool want_excl)
3185	{
3186	int error;
3187
3188	error = may_create(idmap, dir, child: dentry);
3189	if (error)
3190	return error;
3191
3192	if (!dir->i_op->create)
3193	return -EACCES; / shouldn't it be ENOSYS? /
3194
3195	mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);
3196	error = security_inode_create(dir, dentry, mode);
3197	if (error)
3198	return error;
3199	error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
3200	if (!error)
3201	fsnotify_create(dir, dentry);
3202	return error;
3203	}
3204	EXPORT_SYMBOL(vfs_create);
3205
3206	int vfs_mkobj(struct dentry *dentry, umode_t mode,
3207	int (f)(struct* dentry , umode_t, void* *),
3208	void *arg)
3209	{
3210	struct inode *dir = dentry->d_parent->d_inode;
3211	int error = may_create(idmap: &nop_mnt_idmap, dir, child: dentry);
3212	if (error)
3213	return error;
3214
3215	mode &= S_IALLUGO;
3216	mode \|= S_IFREG;
3217	error = security_inode_create(dir, dentry, mode);
3218	if (error)
3219	return error;
3220	error = f(dentry, mode, arg);
3221	if (!error)
3222	fsnotify_create(dir, dentry);
3223	return error;
3224	}
3225	EXPORT_SYMBOL(vfs_mkobj);
3226
3227	bool may_open_dev(const struct path *path)
3228	{
3229	return !(path->mnt->mnt_flags & MNT_NODEV) &&
3230	!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
3231	}
3232
3233	static int may_open(struct mnt_idmap idmap, const* struct path *path,
3234	int acc_mode, int flag)
3235	{
3236	struct dentry *dentry = path->dentry;
3237	struct inode *inode = dentry->d_inode;
3238	int error;
3239
3240	if (!inode)
3241	return -ENOENT;
3242
3243	switch (inode->i_mode & S_IFMT) {
3244	case S_IFLNK:
3245	return -ELOOP;
3246	case S_IFDIR:
3247	if (acc_mode & MAY_WRITE)
3248	return -EISDIR;
3249	if (acc_mode & MAY_EXEC)
3250	return -EACCES;
3251	break;
3252	case S_IFBLK:
3253	case S_IFCHR:
3254	if (!may_open_dev(path))
3255	return -EACCES;
3256	fallthrough;
3257	case S_IFIFO:
3258	case S_IFSOCK:
3259	if (acc_mode & MAY_EXEC)
3260	return -EACCES;
3261	flag &= ~O_TRUNC;
3262	break;
3263	case S_IFREG:
3264	if ((acc_mode & MAY_EXEC) && path_noexec(path))
3265	return -EACCES;
3266	break;
3267	}
3268
3269	error = inode_permission(idmap, inode, MAY_OPEN \| acc_mode);
3270	if (error)
3271	return error;
3272
3273	/*
3274	* An append-only file must be opened in append mode for writing.
3275	*/
3276	if (IS_APPEND(inode)) {
3277	if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
3278	return -EPERM;
3279	if (flag & O_TRUNC)
3280	return -EPERM;
3281	}
3282
3283	/ O_NOATIME can only be set by the owner or superuser /
3284	if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode))
3285	return -EPERM;
3286
3287	return `0`;
3288	}
3289
3290	static int handle_truncate(struct mnt_idmap idmap, struct* file *filp)
3291	{
3292	const struct path *path = &filp->f_path;
3293	struct inode *inode = path->dentry->d_inode;
3294	int error = get_write_access(inode);
3295	if (error)
3296	return error;
3297
3298	error = security_file_truncate(file: filp);
3299	if (!error) {
3300	error = do_truncate(idmap, path->dentry, start: `0`,
3301	ATTR_MTIME\|ATTR_CTIME\|ATTR_OPEN,
3302	filp);
3303	}
3304	put_write_access(inode);
3305	return error;
3306	}
3307
/*
 * Convert open(2) flags into the form handed to ->atomic_open().
 *
 * If both access-mode bits are set (O_ACCMODE field equal to 3) the value is
 * decremented, which turns the access-mode field into 2 and leaves all other
 * bits alone.  Every other value is returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	if ((flag & O_ACCMODE) == 3)
		flag--;
	return flag;
}
3314
3315	static int may_o_create(struct mnt_idmap *idmap,
3316	const struct path dir, struct* dentry *dentry,
3317	umode_t mode)
3318	{
3319	int error = security_path_mknod(dir, dentry, mode, dev: `0`);
3320	if (error)
3321	return error;
3322
3323	if (!fsuidgid_has_mapping(sb: dir->dentry->d_sb, idmap))
3324	return -EOVERFLOW;
3325
3326	error = inode_permission(idmap, dir->dentry->d_inode,
3327	MAY_WRITE \| MAY_EXEC);
3328	if (error)
3329	return error;
3330
3331	return security_inode_create(dir: dir->dentry->d_inode, dentry, mode);
3332	}
3333
3334	/*
3335	* Attempt to atomically look up, create and open a file from a negative
3336	* dentry.
3337	*
3338	* Returns 0 if successful. The file will have been created and attached to
3339	* @file by the filesystem calling finish_open().
3340	*
3341	* If the file was looked up only or didn't need creating, FMODE_OPENED won't
3342	* be set. The caller will need to perform the open themselves. @path will
3343	* have been updated to point to the new dentry. This may be negative.
3344	*
3345	* Returns an error code otherwise.
3346	*/
3347	static struct dentry atomic_open(struct* nameidata nd, struct* dentry *dentry,
3348	struct file *file,
3349	int open_flag, umode_t mode)
3350	{
3351	struct dentry *const DENTRY_NOT_SET = (void *) -`1UL`;
3352	struct inode *dir = nd->path.dentry->d_inode;
3353	int error;
3354
3355	if (nd->flags & LOOKUP_DIRECTORY)
3356	open_flag \|= O_DIRECTORY;
3357
3358	file->f_path.dentry = DENTRY_NOT_SET;
3359	file->f_path.mnt = nd->path.mnt;
3360	error = dir->i_op->atomic_open(dir, dentry, file,
3361	open_to_namei_flags(flag: open_flag), mode);
3362	d_lookup_done(dentry);
3363	if (!error) {
3364	if (file->f_mode & FMODE_OPENED) {
3365	if (unlikely(dentry != file->f_path.dentry)) {
3366	dput(dentry);
3367	dentry = dget(dentry: file->f_path.dentry);
3368	}
3369	} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
3370	error = -EIO;
3371	} else {
3372	if (file->f_path.dentry) {
3373	dput(dentry);
3374	dentry = file->f_path.dentry;
3375	}
3376	if (unlikely(d_is_negative(dentry)))
3377	error = -ENOENT;
3378	}
3379	}
3380	if (error) {
3381	dput(dentry);
3382	dentry = ERR_PTR(error);
3383	}
3384	return dentry;
3385	}
3386
3387	/*
3388	* Look up and maybe create and open the last component.
3389	*
3390	* Must be called with parent locked (exclusive in O_CREAT case).
3391	*
3392	* Returns 0 on success, that is, if
3393	* the file was successfully atomically created (if necessary) and opened, or
3394	* the file was not completely opened at this time, though lookups and
3395	* creations were performed.
3396	* These case are distinguished by presence of FMODE_OPENED on file->f_mode.
3397	* In the latter case dentry returned in @path might be negative if O_CREAT
3398	* hadn't been specified.
3399	*
3400	* An error code is returned on failure.
3401	*/
3402	static struct dentry lookup_open(struct* nameidata nd, struct* file *file,
3403	const struct open_flags *op,
3404	bool got_write)
3405	{
3406	struct mnt_idmap *idmap;
3407	struct dentry *dir = nd->path.dentry;
3408	struct inode *dir_inode = dir->d_inode;
3409	int open_flag = op->open_flag;
3410	struct dentry *dentry;
3411	int error, create_error = `0`;
3412	umode_t mode = op->mode;
3413	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
3414
3415	if (unlikely(IS_DEADDIR(dir_inode)))
3416	return ERR_PTR(error: -ENOENT);
3417
3418	file->f_mode &= ~FMODE_CREATED;
3419	dentry = d_lookup(dir, &nd->last);
3420	for (;;) {
3421	if (!dentry) {
3422	dentry = d_alloc_parallel(dir, &nd->last, &wq);
3423	if (IS_ERR(ptr: dentry))
3424	return dentry;
3425	}
3426	if (d_in_lookup(dentry))
3427	break;
3428
3429	error = d_revalidate(dentry, flags: nd->flags);
3430	if (likely(error > `0`))
3431	break;
3432	if (error)
3433	goto out_dput;
3434	d_invalidate(dentry);
3435	dput(dentry);
3436	dentry = NULL;
3437	}
3438	if (dentry->d_inode) {
3439	/ Cached positive dentry: will open in f_op->open /
3440	return dentry;
3441	}
3442
3443	/*
3444	* Checking write permission is tricky, bacuse we don't know if we are
3445	* going to actually need it: O_CREAT opens should work as long as the
3446	* file exists. But checking existence breaks atomicity. The trick is
3447	* to check access and if not granted clear O_CREAT from the flags.
3448	*
3449	* Another problem is returing the "right" error value (e.g. for an
3450	* O_EXCL open we want to return EEXIST not EROFS).
3451	*/
3452	if (unlikely(!got_write))
3453	open_flag &= ~O_TRUNC;
3454	idmap = mnt_idmap(mnt: nd->path.mnt);
3455	if (open_flag & O_CREAT) {
3456	if (open_flag & O_EXCL)
3457	open_flag &= ~O_TRUNC;
3458	mode = vfs_prepare_mode(idmap, dir: dir->d_inode, mode, mask_perms: mode, type: mode);
3459	if (likely(got_write))
3460	create_error = may_o_create(idmap, dir: &nd->path,
3461	dentry, mode);
3462	else
3463	create_error = -EROFS;
3464	}
3465	if (create_error)
3466	open_flag &= ~O_CREAT;
3467	if (dir_inode->i_op->atomic_open) {
3468	dentry = atomic_open(nd, dentry, file, open_flag, mode);
3469	if (unlikely(create_error) && dentry == ERR_PTR(error: -ENOENT))
3470	dentry = ERR_PTR(error: create_error);
3471	return dentry;
3472	}
3473
3474	if (d_in_lookup(dentry)) {
3475	struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
3476	nd->flags);
3477	d_lookup_done(dentry);
3478	if (unlikely(res)) {
3479	if (IS_ERR(ptr: res)) {
3480	error = PTR_ERR(ptr: res);
3481	goto out_dput;
3482	}
3483	dput(dentry);
3484	dentry = res;
3485	}
3486	}
3487
3488	/ Negative dentry, just create the file /
3489	if (!dentry->d_inode && (open_flag & O_CREAT)) {
3490	file->f_mode \|= FMODE_CREATED;
3491	audit_inode_child(parent: dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
3492	if (!dir_inode->i_op->create) {
3493	error = -EACCES;
3494	goto out_dput;
3495	}
3496
3497	error = dir_inode->i_op->create(idmap, dir_inode, dentry,
3498	mode, open_flag & O_EXCL);
3499	if (error)
3500	goto out_dput;
3501	}
3502	if (unlikely(create_error) && !dentry->d_inode) {
3503	error = create_error;
3504	goto out_dput;
3505	}
3506	return dentry;
3507
3508	out_dput:
3509	dput(dentry);
3510	return ERR_PTR(error);
3511	}
3512
3513	static const char open_last_lookups(struct* nameidata *nd,
3514	struct file file, const* struct open_flags *op)
3515	{
3516	struct dentry *dir = nd->path.dentry;
3517	int open_flag = op->open_flag;
3518	bool got_write = false;
3519	struct dentry *dentry;
3520	const char *res;
3521
3522	nd->flags \|= op->intent;
3523
3524	if (nd->last_type != LAST_NORM) {
3525	if (nd->depth)
3526	put_link(nd);
3527	return handle_dots(nd, type: nd->last_type);
3528	}
3529
3530	if (!(open_flag & O_CREAT)) {
3531	if (nd->last.name[nd->last.len])
3532	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
3533	/ we _can_ be in RCU mode here /
3534	dentry = lookup_fast(nd);
3535	if (IS_ERR(ptr: dentry))
3536	return ERR_CAST(ptr: dentry);
3537	if (likely(dentry))
3538	goto finish_lookup;
3539
3540	if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
3541	return ERR_PTR(error: -ECHILD);
3542	} else {
3543	/ create side of things /
3544	if (nd->flags & LOOKUP_RCU) {
3545	if (!try_to_unlazy(nd))
3546	return ERR_PTR(error: -ECHILD);
3547	}
3548	audit_inode(name: nd->name, dentry: dir, AUDIT_INODE_PARENT);
3549	/ trailing slashes? /
3550	if (unlikely(nd->last.name[nd->last.len]))
3551	return ERR_PTR(error: -EISDIR);
3552	}
3553
3554	if (open_flag & (O_CREAT \| O_TRUNC \| O_WRONLY \| O_RDWR)) {
3555	got_write = !mnt_want_write(mnt: nd->path.mnt);
3556	/*
3557	* do _not_ fail yet - we might not need that or fail with
3558	* a different error; let lookup_open() decide; we'll be
3559	* dropping this one anyway.
3560	*/
3561	}
3562	if (open_flag & O_CREAT)
3563	inode_lock(inode: dir->d_inode);
3564	else
3565	inode_lock_shared(inode: dir->d_inode);
3566	dentry = lookup_open(nd, file, op, got_write);
3567	if (!IS_ERR(ptr: dentry) && (file->f_mode & FMODE_CREATED))
3568	fsnotify_create(dir: dir->d_inode, dentry);
3569	if (open_flag & O_CREAT)
3570	inode_unlock(inode: dir->d_inode);
3571	else
3572	inode_unlock_shared(inode: dir->d_inode);
3573
3574	if (got_write)
3575	mnt_drop_write(mnt: nd->path.mnt);
3576
3577	if (IS_ERR(ptr: dentry))
3578	return ERR_CAST(ptr: dentry);
3579
3580	if (file->f_mode & (FMODE_OPENED \| FMODE_CREATED)) {
3581	dput(nd->path.dentry);
3582	nd->path.dentry = dentry;
3583	return NULL;
3584	}
3585
3586	finish_lookup:
3587	if (nd->depth)
3588	put_link(nd);
3589	res = step_into(nd, flags: WALK_TRAILING, dentry);
3590	if (unlikely(res))
3591	nd->flags &= ~(LOOKUP_OPEN\|LOOKUP_CREATE\|LOOKUP_EXCL);
3592	return res;
3593	}
3594
3595	/*
3596	* Handle the last step of open()
3597	*/
3598	static int do_open(struct nameidata *nd,
3599	struct file file, const* struct open_flags *op)
3600	{
3601	struct mnt_idmap *idmap;
3602	int open_flag = op->open_flag;
3603	bool do_truncate;
3604	int acc_mode;
3605	int error;
3606
3607	if (!(file->f_mode & (FMODE_OPENED \| FMODE_CREATED))) {
3608	error = complete_walk(nd);
3609	if (error)
3610	return error;
3611	}
3612	if (!(file->f_mode & FMODE_CREATED))
3613	audit_inode(name: nd->name, dentry: nd->path.dentry, aflags: `0`);
3614	idmap = mnt_idmap(mnt: nd->path.mnt);
3615	if (open_flag & O_CREAT) {
3616	if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
3617	return -EEXIST;
3618	if (d_is_dir(dentry: nd->path.dentry))
3619	return -EISDIR;
3620	error = may_create_in_sticky(idmap, nd,
3621	inode: d_backing_inode(upper: nd->path.dentry));
3622	if (unlikely(error))
3623	return error;
3624	}
3625	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(dentry: nd->path.dentry))
3626	return -ENOTDIR;
3627
3628	do_truncate = false;
3629	acc_mode = op->acc_mode;
3630	if (file->f_mode & FMODE_CREATED) {
3631	/ Don't check for write permission, don't truncate /
3632	open_flag &= ~O_TRUNC;
3633	acc_mode = `0`;
3634	} else if (d_is_reg(dentry: nd->path.dentry) && open_flag & O_TRUNC) {
3635	error = mnt_want_write(mnt: nd->path.mnt);
3636	if (error)
3637	return error;
3638	do_truncate = true;
3639	}
3640	error = may_open(idmap, path: &nd->path, acc_mode, flag: open_flag);
3641	if (!error && !(file->f_mode & FMODE_OPENED))
3642	error = vfs_open(&nd->path, file);
3643	if (!error)
3644	error = security_file_post_open(file, mask: op->acc_mode);
3645	if (!error && do_truncate)
3646	error = handle_truncate(idmap, filp: file);
3647	if (unlikely(error > `0`)) {
3648	WARN_ON(`1`);
3649	error = -EINVAL;
3650	}
3651	if (do_truncate)
3652	mnt_drop_write(mnt: nd->path.mnt);
3653	return error;
3654	}
3655
3656	/**
3657	* vfs_tmpfile - create tmpfile
3658	* @idmap: idmap of the mount the inode was found from
3659	* @parentpath: pointer to the path of the base directory
3660	* @file: file descriptor of the new tmpfile
3661	* @mode: mode of the new tmpfile
3662	*
3663	* Create a temporary file.
3664	*
3665	* If the inode has been found through an idmapped mount the idmap of
3666	* the vfsmount must be passed through @idmap. This function will then take
3667	* care to map the inode according to @idmap before checking permissions.
3668	* On non-idmapped mounts or if permission checking is to be performed on the
3669	* raw inode simply pass @nop_mnt_idmap.
3670	*/
3671	static int vfs_tmpfile(struct mnt_idmap *idmap,
3672	const struct path *parentpath,
3673	struct file *file, umode_t mode)
3674	{
3675	struct dentry *child;
3676	struct inode *dir = d_inode(dentry: parentpath->dentry);
3677	struct inode *inode;
3678	int error;
3679	int open_flag = file->f_flags;
3680
3681	/ we want directory to be writable /
3682	error = inode_permission(idmap, dir, MAY_WRITE \| MAY_EXEC);
3683	if (error)
3684	return error;
3685	if (!dir->i_op->tmpfile)
3686	return -EOPNOTSUPP;
3687	child = d_alloc(parentpath->dentry, &slash_name);
3688	if (unlikely(!child))
3689	return -ENOMEM;
3690	file->f_path.mnt = parentpath->mnt;
3691	file->f_path.dentry = child;
3692	mode = vfs_prepare_mode(idmap, dir, mode, mask_perms: mode, type: mode);
3693	error = dir->i_op->tmpfile(idmap, dir, file, mode);
3694	dput(child);
3695	if (error)
3696	return error;
3697	/ Don't check for other permissions, the inode was just created /
3698	error = may_open(idmap, path: &file->f_path, acc_mode: `0`, flag: file->f_flags);
3699	if (error)
3700	return error;
3701	inode = file_inode(f: file);
3702	if (!(open_flag & O_EXCL)) {
3703	spin_lock(lock: &inode->i_lock);
3704	inode->i_state \|= I_LINKABLE;
3705	spin_unlock(lock: &inode->i_lock);
3706	}
3707	security_inode_post_create_tmpfile(idmap, inode);
3708	return `0`;
3709	}
3710
3711	/**
3712	* kernel_tmpfile_open - open a tmpfile for kernel internal use
3713	* @idmap: idmap of the mount the inode was found from
3714	* @parentpath: path of the base directory
3715	* @mode: mode of the new tmpfile
3716	* @open_flag: flags
3717	* @cred: credentials for open
3718	*
3719	* Create and open a temporary file. The file is not accounted in nr_files,
3720	* hence this is only for kernel internal use, and must not be installed into
3721	* file tables or such.
3722	*/
3723	struct file kernel_tmpfile_open(struct* mnt_idmap *idmap,
3724	const struct path *parentpath,
3725	umode_t mode, int open_flag,
3726	const struct cred *cred)
3727	{
3728	struct file *file;
3729	int error;
3730
3731	file = alloc_empty_file_noaccount(flags: open_flag, cred);
3732	if (IS_ERR(ptr: file))
3733	return file;
3734
3735	error = vfs_tmpfile(idmap, parentpath, file, mode);
3736	if (error) {
3737	fput(file);
3738	file = ERR_PTR(error);
3739	}
3740	return file;
3741	}
3742	EXPORT_SYMBOL(kernel_tmpfile_open);
3743
3744	static int do_tmpfile(struct nameidata nd, unsigned* flags,
3745	const struct open_flags *op,
3746	struct file *file)
3747	{
3748	struct path path;
3749	int error = path_lookupat(nd, flags: flags \| LOOKUP_DIRECTORY, path: &path);
3750
3751	if (unlikely(error))
3752	return error;
3753	error = mnt_want_write(mnt: path.mnt);
3754	if (unlikely(error))
3755	goto out;
3756	error = vfs_tmpfile(idmap: mnt_idmap(mnt: path.mnt), parentpath: &path, file, mode: op->mode);
3757	if (error)
3758	goto out2;
3759	audit_inode(name: nd->name, dentry: file->f_path.dentry, aflags: `0`);
3760	out2:
3761	mnt_drop_write(mnt: path.mnt);
3762	out:
3763	path_put(&path);
3764	return error;
3765	}
3766
3767	static int do_o_path(struct nameidata nd, unsigned* flags, struct file *file)
3768	{
3769	struct path path;
3770	int error = path_lookupat(nd, flags, path: &path);
3771	if (!error) {
3772	audit_inode(name: nd->name, dentry: path.dentry, aflags: `0`);
3773	error = vfs_open(&path, file);
3774	path_put(&path);
3775	}
3776	return error;
3777	}
3778
3779	static struct file path_openat(struct* nameidata *nd,
3780	const struct open_flags op, unsigned* flags)
3781	{
3782	struct file *file;
3783	int error;
3784
3785	file = alloc_empty_file(flags: op->open_flag, current_cred());
3786	if (IS_ERR(ptr: file))
3787	return file;
3788
3789	if (unlikely(file->f_flags & __O_TMPFILE)) {
3790	error = do_tmpfile(nd, flags, op, file);
3791	} else if (unlikely(file->f_flags & O_PATH)) {
3792	error = do_o_path(nd, flags, file);
3793	} else {
3794	const char *s = path_init(nd, flags);
3795	while (!(error = link_path_walk(name: s, nd)) &&
3796	(s = open_last_lookups(nd, file, op)) != NULL)
3797	;
3798	if (!error)
3799	error = do_open(nd, file, op);
3800	terminate_walk(nd);
3801	}
3802	if (likely(!error)) {
3803	if (likely(file->f_mode & FMODE_OPENED))
3804	return file;
3805	WARN_ON(`1`);
3806	error = -EINVAL;
3807	}
3808	fput(file);
3809	if (error == -EOPENSTALE) {
3810	if (flags & LOOKUP_RCU)
3811	error = -ECHILD;
3812	else
3813	error = -ESTALE;
3814	}
3815	return ERR_PTR(error);
3816	}
3817
3818	struct file do_filp_open(int* dfd, struct filename *pathname,
3819	const struct open_flags *op)
3820	{
3821	struct nameidata nd;
3822	int flags = op->lookup_flags;
3823	struct file *filp;
3824
3825	set_nameidata(p: &nd, dfd, name: pathname, NULL);
3826	filp = path_openat(nd: &nd, op, flags: flags \| LOOKUP_RCU);
3827	if (unlikely(filp == ERR_PTR(-ECHILD)))
3828	filp = path_openat(nd: &nd, op, flags);
3829	if (unlikely(filp == ERR_PTR(-ESTALE)))
3830	filp = path_openat(nd: &nd, op, flags: flags \| LOOKUP_REVAL);
3831	restore_nameidata();
3832	return filp;
3833	}
3834
3835	struct file do_file_open_root(const* struct path *root,
3836	const char name, const* struct open_flags *op)
3837	{
3838	struct nameidata nd;
3839	struct file *file;
3840	struct filename *filename;
3841	int flags = op->lookup_flags;
3842
3843	if (d_is_symlink(dentry: root->dentry) && op->intent & LOOKUP_OPEN)
3844	return ERR_PTR(error: -ELOOP);
3845
3846	filename = getname_kernel(name);
3847	if (IS_ERR(ptr: filename))
3848	return ERR_CAST(ptr: filename);
3849
3850	set_nameidata(p: &nd, dfd: -`1`, name: filename, root);
3851	file = path_openat(nd: &nd, op, flags: flags \| LOOKUP_RCU);
3852	if (unlikely(file == ERR_PTR(-ECHILD)))
3853	file = path_openat(nd: &nd, op, flags);
3854	if (unlikely(file == ERR_PTR(-ESTALE)))
3855	file = path_openat(nd: &nd, op, flags: flags \| LOOKUP_REVAL);
3856	restore_nameidata();
3857	putname(filename);
3858	return file;
3859	}
3860
3861	static struct dentry filename_create(int* dfd, struct filename *name,
3862	struct path path, unsigned* int lookup_flags)
3863	{
3864	struct dentry *dentry = ERR_PTR(error: -EEXIST);
3865	struct qstr last;
3866	bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
3867	unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
3868	unsigned int create_flags = LOOKUP_CREATE \| LOOKUP_EXCL;
3869	int type;
3870	int err2;
3871	int error;
3872
3873	error = filename_parentat(dfd, name, flags: reval_flag, parent: path, last: &last, type: &type);
3874	if (error)
3875	return ERR_PTR(error);
3876
3877	/*
3878	* Yucky last component or no last component at all?
3879	* (foo/., foo/.., /////)
3880	*/
3881	if (unlikely(type != LAST_NORM))
3882	goto out;
3883
3884	/ don't fail immediately if it's r/o, at least try to report other errors /
3885	err2 = mnt_want_write(mnt: path->mnt);
3886	/*
3887	* Do the final lookup. Suppress 'create' if there is a trailing
3888	* '/', and a directory wasn't requested.
3889	*/
3890	if (last.name[last.len] && !want_dir)
3891	create_flags = `0`;
3892	inode_lock_nested(inode: path->dentry->d_inode, subclass: I_MUTEX_PARENT);
3893	dentry = lookup_one_qstr_excl(&last, path->dentry,
3894	reval_flag \| create_flags);
3895	if (IS_ERR(ptr: dentry))
3896	goto unlock;
3897
3898	error = -EEXIST;
3899	if (d_is_positive(dentry))
3900	goto fail;
3901
3902	/*
3903	* Special case - lookup gave negative, but... we had foo/bar/
3904	* From the vfs_mknod() POV we just have a negative dentry -
3905	* all is fine. Let's be bastards - you had / on the end, you've
3906	* been asking for (non-existent) directory. -ENOENT for you.
3907	*/
3908	if (unlikely(!create_flags)) {
3909	error = -ENOENT;
3910	goto fail;
3911	}
3912	if (unlikely(err2)) {
3913	error = err2;
3914	goto fail;
3915	}
3916	return dentry;
3917	fail:
3918	dput(dentry);
3919	dentry = ERR_PTR(error);
3920	unlock:
3921	inode_unlock(inode: path->dentry->d_inode);
3922	if (!err2)
3923	mnt_drop_write(mnt: path->mnt);
3924	out:
3925	path_put(path);
3926	return dentry;
3927	}
3928
/*
 * filename_create() for a kernel-space pathname: wraps @pathname with
 * getname_kernel(), calls filename_create() and releases the name again.
 * Returns whatever filename_create() returned.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	struct filename *filename = getname_kernel(pathname);
	struct dentry *res = filename_create(dfd, filename, path, lookup_flags);

	putname(filename);
	return res;
}
EXPORT_SYMBOL(kern_path_create);
3939
3940	void done_path_create(struct path path, struct* dentry *dentry)
3941	{
3942	dput(dentry);
3943	inode_unlock(inode: path->dentry->d_inode);
3944	mnt_drop_write(mnt: path->mnt);
3945	path_put(path);
3946	}
3947	EXPORT_SYMBOL(done_path_create);
3948
3949	inline struct dentry user_path_create(int* dfd, const char __user *pathname,
3950	struct path path, unsigned* int lookup_flags)
3951	{
3952	struct filename *filename = getname(filename: pathname);
3953	struct dentry *res = filename_create(dfd, name: filename, path, lookup_flags);
3954
3955	putname(filename);
3956	return res;
3957	}
3958	EXPORT_SYMBOL(user_path_create);
3959
3960	/**
3961	* vfs_mknod - create device node or file
3962	* @idmap: idmap of the mount the inode was found from
3963	* @dir: inode of @dentry
3964	* @dentry: pointer to dentry of the base directory
3965	* @mode: mode of the new device node or file
3966	* @dev: device number of device to create
3967	*
3968	* Create a device node or file.
3969	*
3970	* If the inode has been found through an idmapped mount the idmap of
3971	* the vfsmount must be passed through @idmap. This function will then take
3972	* care to map the inode according to @idmap before checking permissions.
3973	* On non-idmapped mounts or if permission checking is to be performed on the
3974	* raw inode simply pass @nop_mnt_idmap.
3975	*/
3976	int vfs_mknod(struct mnt_idmap idmap, struct* inode *dir,
3977	struct dentry *dentry, umode_t mode, dev_t dev)
3978	{
3979	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
3980	int error = may_create(idmap, dir, child: dentry);
3981
3982	if (error)
3983	return error;
3984
3985	if ((S_ISCHR(mode) \|\| S_ISBLK(mode)) && !is_whiteout &&
3986	!capable(CAP_MKNOD))
3987	return -EPERM;
3988
3989	if (!dir->i_op->mknod)
3990	return -EPERM;
3991
3992	mode = vfs_prepare_mode(idmap, dir, mode, mask_perms: mode, type: mode);
3993	error = devcgroup_inode_mknod(mode, dev);
3994	if (error)
3995	return error;
3996
3997	error = security_inode_mknod(dir, dentry, mode, dev);
3998	if (error)
3999	return error;
4000
4001	error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
4002	if (!error)
4003	fsnotify_create(dir, dentry);
4004	return error;
4005	}
4006	EXPORT_SYMBOL(vfs_mknod);
4007
4008	static int may_mknod(umode_t mode)
4009	{
4010	switch (mode & S_IFMT) {
4011	case S_IFREG:
4012	case S_IFCHR:
4013	case S_IFBLK:
4014	case S_IFIFO:
4015	case S_IFSOCK:
4016	case `0`: / zero mode translates to S_IFREG /
4017	return `0`;
4018	case S_IFDIR:
4019	return -EPERM;
4020	default:
4021	return -EINVAL;
4022	}
4023	}
4024
4025	static int do_mknodat(int dfd, struct filename *name, umode_t mode,
4026	unsigned int dev)
4027	{
4028	struct mnt_idmap *idmap;
4029	struct dentry *dentry;
4030	struct path path;
4031	int error;
4032	unsigned int lookup_flags = `0`;
4033
4034	error = may_mknod(mode);
4035	if (error)
4036	goto out1;
4037	retry:
4038	dentry = filename_create(dfd, name, path: &path, lookup_flags);
4039	error = PTR_ERR(ptr: dentry);
4040	if (IS_ERR(ptr: dentry))
4041	goto out1;
4042
4043	error = security_path_mknod(dir: &path, dentry,
4044	mode: mode_strip_umask(dir: path.dentry->d_inode, mode), dev);
4045	if (error)
4046	goto out2;
4047
4048	idmap = mnt_idmap(mnt: path.mnt);
4049	switch (mode & S_IFMT) {
4050	case `0`: case S_IFREG:
4051	error = vfs_create(idmap, path.dentry->d_inode,
4052	dentry, mode, true);
4053	if (!error)
4054	security_path_post_mknod(idmap, dentry);
4055	break;
4056	case S_IFCHR: case S_IFBLK:
4057	error = vfs_mknod(idmap, path.dentry->d_inode,
4058	dentry, mode, new_decode_dev(dev));
4059	break;
4060	case S_IFIFO: case S_IFSOCK:
4061	error = vfs_mknod(idmap, path.dentry->d_inode,
4062	dentry, mode, `0`);
4063	break;
4064	}
4065	out2:
4066	done_path_create(&path, dentry);
4067	if (retry_estale(error, flags: lookup_flags)) {
4068	lookup_flags \|= LOOKUP_REVAL;
4069	goto retry;
4070	}
4071	out1:
4072	putname(name);
4073	return error;
4074	}
4075
4076	SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
4077	unsigned int, dev)
4078	{
4079	return do_mknodat(dfd, name: getname(filename), mode, dev);
4080	}
4081
4082	SYSCALL_DEFINE3(mknod, const char __user , filename, umode_t, mode, unsigned*, dev)
4083	{
4084	return do_mknodat(AT_FDCWD, name: getname(filename), mode, dev);
4085	}
4086
4087	/**
4088	* vfs_mkdir - create directory
4089	* @idmap: idmap of the mount the inode was found from
4090	* @dir: inode of @dentry
4091	* @dentry: pointer to dentry of the base directory
4092	* @mode: mode of the new directory
4093	*
4094	* Create a directory.
4095	*
4096	* If the inode has been found through an idmapped mount the idmap of
4097	* the vfsmount must be passed through @idmap. This function will then take
4098	* care to map the inode according to @idmap before checking permissions.
4099	* On non-idmapped mounts or if permission checking is to be performed on the
4100	* raw inode simply pass @nop_mnt_idmap.
4101	*/
4102	int vfs_mkdir(struct mnt_idmap idmap, struct* inode *dir,
4103	struct dentry *dentry, umode_t mode)
4104	{
4105	int error;
4106	unsigned max_links = dir->i_sb->s_max_links;
4107
4108	error = may_create(idmap, dir, child: dentry);
4109	if (error)
4110	return error;
4111
4112	if (!dir->i_op->mkdir)
4113	return -EPERM;
4114
4115	mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO \| S_ISVTX, type: `0`);
4116	error = security_inode_mkdir(dir, dentry, mode);
4117	if (error)
4118	return error;
4119
4120	if (max_links && dir->i_nlink >= max_links)
4121	return -EMLINK;
4122
4123	error = dir->i_op->mkdir(idmap, dir, dentry, mode);
4124	if (!error)
4125	fsnotify_mkdir(dir, dentry);
4126	return error;
4127	}
4128	EXPORT_SYMBOL(vfs_mkdir);
4129
4130	int do_mkdirat(int dfd, struct filename *name, umode_t mode)
4131	{
4132	struct dentry *dentry;
4133	struct path path;
4134	int error;
4135	unsigned int lookup_flags = LOOKUP_DIRECTORY;
4136
4137	retry:
4138	dentry = filename_create(dfd, name, path: &path, lookup_flags);
4139	error = PTR_ERR(ptr: dentry);
4140	if (IS_ERR(ptr: dentry))
4141	goto out_putname;
4142
4143	error = security_path_mkdir(dir: &path, dentry,
4144	mode: mode_strip_umask(dir: path.dentry->d_inode, mode));
4145	if (!error) {
4146	error = vfs_mkdir(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
4147	dentry, mode);
4148	}
4149	done_path_create(&path, dentry);
4150	if (retry_estale(error, flags: lookup_flags)) {
4151	lookup_flags \|= LOOKUP_REVAL;
4152	goto retry;
4153	}
4154	out_putname:
4155	putname(name);
4156	return error;
4157	}
4158
4159	SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
4160	{
4161	return do_mkdirat(dfd, name: getname(filename: pathname), mode);
4162	}
4163
4164	SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
4165	{
4166	return do_mkdirat(AT_FDCWD, name: getname(filename: pathname), mode);
4167	}
4168
4169	/**
4170	* vfs_rmdir - remove directory
4171	* @idmap: idmap of the mount the inode was found from
4172	* @dir: inode of @dentry
4173	* @dentry: pointer to dentry of the base directory
4174	*
4175	* Remove a directory.
4176	*
4177	* If the inode has been found through an idmapped mount the idmap of
4178	* the vfsmount must be passed through @idmap. This function will then take
4179	* care to map the inode according to @idmap before checking permissions.
4180	* On non-idmapped mounts or if permission checking is to be performed on the
4181	* raw inode simply pass @nop_mnt_idmap.
4182	*/
4183	int vfs_rmdir(struct mnt_idmap idmap, struct* inode *dir,
4184	struct dentry *dentry)
4185	{
4186	int error = may_delete(idmap, dir, victim: dentry, isdir: `1`);
4187
4188	if (error)
4189	return error;
4190
4191	if (!dir->i_op->rmdir)
4192	return -EPERM;
4193
4194	dget(dentry);
4195	inode_lock(inode: dentry->d_inode);
4196
4197	error = -EBUSY;
4198	if (is_local_mountpoint(dentry) \|\|
4199	(dentry->d_inode->i_flags & S_KERNEL_FILE))
4200	goto out;
4201
4202	error = security_inode_rmdir(dir, dentry);
4203	if (error)
4204	goto out;
4205
4206	error = dir->i_op->rmdir(dir, dentry);
4207	if (error)
4208	goto out;
4209
4210	shrink_dcache_parent(dentry);
4211	dentry->d_inode->i_flags \|= S_DEAD;
4212	dont_mount(dentry);
4213	detach_mounts(dentry);
4214
4215	out:
4216	inode_unlock(inode: dentry->d_inode);
4217	dput(dentry);
4218	if (!error)
4219	d_delete_notify(dir, dentry);
4220	return error;
4221	}
4222	EXPORT_SYMBOL(vfs_rmdir);
4223
4224	int do_rmdir(int dfd, struct filename *name)
4225	{
4226	int error;
4227	struct dentry *dentry;
4228	struct path path;
4229	struct qstr last;
4230	int type;
4231	unsigned int lookup_flags = `0`;
4232	retry:
4233	error = filename_parentat(dfd, name, flags: lookup_flags, parent: &path, last: &last, type: &type);
4234	if (error)
4235	goto exit1;
4236
4237	switch (type) {
4238	case LAST_DOTDOT:
4239	error = -ENOTEMPTY;
4240	goto exit2;
4241	case LAST_DOT:
4242	error = -EINVAL;
4243	goto exit2;
4244	case LAST_ROOT:
4245	error = -EBUSY;
4246	goto exit2;
4247	}
4248
4249	error = mnt_want_write(mnt: path.mnt);
4250	if (error)
4251	goto exit2;
4252
4253	inode_lock_nested(inode: path.dentry->d_inode, subclass: I_MUTEX_PARENT);
4254	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
4255	error = PTR_ERR(ptr: dentry);
4256	if (IS_ERR(ptr: dentry))
4257	goto exit3;
4258	if (!dentry->d_inode) {
4259	error = -ENOENT;
4260	goto exit4;
4261	}
4262	error = security_path_rmdir(dir: &path, dentry);
4263	if (error)
4264	goto exit4;
4265	error = vfs_rmdir(mnt_idmap(mnt: path.mnt), path.dentry->d_inode, dentry);
4266	exit4:
4267	dput(dentry);
4268	exit3:
4269	inode_unlock(inode: path.dentry->d_inode);
4270	mnt_drop_write(mnt: path.mnt);
4271	exit2:
4272	path_put(&path);
4273	if (retry_estale(error, flags: lookup_flags)) {
4274	lookup_flags \|= LOOKUP_REVAL;
4275	goto retry;
4276	}
4277	exit1:
4278	putname(name);
4279	return error;
4280	}
4281
4282	SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
4283	{
4284	return do_rmdir(AT_FDCWD, name: getname(filename: pathname));
4285	}
4286
4287	/**
4288	* vfs_unlink - unlink a filesystem object
4289	* @idmap: idmap of the mount the inode was found from
4290	* @dir: parent directory
4291	* @dentry: victim
4292	* @delegated_inode: returns victim inode, if the inode is delegated.
4293	*
4294	* The caller must hold dir->i_mutex.
4295	*
4296	* If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
4297	* return a reference to the inode in delegated_inode. The caller
4298	* should then break the delegation on that inode and retry. Because
4299	* breaking a delegation may take a long time, the caller should drop
4300	* dir->i_mutex before doing so.
4301	*
4302	* Alternatively, a caller may pass NULL for delegated_inode. This may
4303	* be appropriate for callers that expect the underlying filesystem not
4304	* to be NFS exported.
4305	*
4306	* If the inode has been found through an idmapped mount the idmap of
4307	* the vfsmount must be passed through @idmap. This function will then take
4308	* care to map the inode according to @idmap before checking permissions.
4309	* On non-idmapped mounts or if permission checking is to be performed on the
4310	* raw inode simply pass @nop_mnt_idmap.
4311	*/
4312	int vfs_unlink(struct mnt_idmap idmap, struct* inode *dir,
4313	struct dentry dentry, struct* inode **delegated_inode)
4314	{
4315	struct inode *target = dentry->d_inode;
4316	int error = may_delete(idmap, dir, victim: dentry, isdir: `0`);
4317
4318	if (error)
4319	return error;
4320
4321	if (!dir->i_op->unlink)
4322	return -EPERM;
4323
4324	inode_lock(inode: target);
4325	if (IS_SWAPFILE(target))
4326	error = -EPERM;
4327	else if (is_local_mountpoint(dentry))
4328	error = -EBUSY;
4329	else {
4330	error = security_inode_unlink(dir, dentry);
4331	if (!error) {
4332	error = try_break_deleg(inode: target, delegated_inode);
4333	if (error)
4334	goto out;
4335	error = dir->i_op->unlink(dir, dentry);
4336	if (!error) {
4337	dont_mount(dentry);
4338	detach_mounts(dentry);
4339	}
4340	}
4341	}
4342	out:
4343	inode_unlock(inode: target);
4344
4345	/ We don't d_delete() NFS sillyrenamed files--they still exist. /
4346	if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
4347	fsnotify_unlink(dir, dentry);
4348	} else if (!error) {
4349	fsnotify_link_count(inode: target);
4350	d_delete_notify(dir, dentry);
4351	}
4352
4353	return error;
4354	}
4355	EXPORT_SYMBOL(vfs_unlink);
4356
4357	/*
4358	* Make sure that the actual truncation of the file will occur outside its
4359	* directory's i_mutex. Truncate can take a long time if there is a lot of
4360	* writeout happening, and we don't want to prevent access to the directory
4361	* while waiting on the I/O.
4362	*/
4363	int do_unlinkat(int dfd, struct filename *name)
4364	{
4365	int error;
4366	struct dentry *dentry;
4367	struct path path;
4368	struct qstr last;
4369	int type;
4370	struct inode *inode = NULL;
4371	struct inode *delegated_inode = NULL;
4372	unsigned int lookup_flags = `0`;
4373	retry:
4374	error = filename_parentat(dfd, name, flags: lookup_flags, parent: &path, last: &last, type: &type);
4375	if (error)
4376	goto exit1;
4377
4378	error = -EISDIR;
4379	if (type != LAST_NORM)
4380	goto exit2;
4381
4382	error = mnt_want_write(mnt: path.mnt);
4383	if (error)
4384	goto exit2;
4385	retry_deleg:
4386	inode_lock_nested(inode: path.dentry->d_inode, subclass: I_MUTEX_PARENT);
4387	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
4388	error = PTR_ERR(ptr: dentry);
4389	if (!IS_ERR(ptr: dentry)) {
4390
4391	/ Why not before? Because we want correct error value /
4392	if (last.name[last.len] \|\| d_is_negative(dentry))
4393	goto slashes;
4394	inode = dentry->d_inode;
4395	ihold(inode);
4396	error = security_path_unlink(dir: &path, dentry);
4397	if (error)
4398	goto exit3;
4399	error = vfs_unlink(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
4400	dentry, &delegated_inode);
4401	exit3:
4402	dput(dentry);
4403	}
4404	inode_unlock(inode: path.dentry->d_inode);
4405	if (inode)
4406	iput(inode); / truncate the inode here /
4407	inode = NULL;
4408	if (delegated_inode) {
4409	error = break_deleg_wait(delegated_inode: &delegated_inode);
4410	if (!error)
4411	goto retry_deleg;
4412	}
4413	mnt_drop_write(mnt: path.mnt);
4414	exit2:
4415	path_put(&path);
4416	if (retry_estale(error, flags: lookup_flags)) {
4417	lookup_flags \|= LOOKUP_REVAL;
4418	inode = NULL;
4419	goto retry;
4420	}
4421	exit1:
4422	putname(name);
4423	return error;
4424
4425	slashes:
4426	if (d_is_negative(dentry))
4427	error = -ENOENT;
4428	else if (d_is_dir(dentry))
4429	error = -EISDIR;
4430	else
4431	error = -ENOTDIR;
4432	goto exit3;
4433	}
4434
4435	SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user , pathname, int*, flag)
4436	{
4437	if ((flag & ~AT_REMOVEDIR) != `0`)
4438	return -EINVAL;
4439
4440	if (flag & AT_REMOVEDIR)
4441	return do_rmdir(dfd, name: getname(filename: pathname));
4442	return do_unlinkat(dfd, name: getname(filename: pathname));
4443	}
4444
4445	SYSCALL_DEFINE1(unlink, const char __user *, pathname)
4446	{
4447	return do_unlinkat(AT_FDCWD, name: getname(filename: pathname));
4448	}
4449
4450	/**
4451	* vfs_symlink - create symlink
4452	* @idmap: idmap of the mount the inode was found from
4453	* @dir: inode of @dentry
4454	* @dentry: pointer to dentry of the base directory
4455	* @oldname: name of the file to link to
4456	*
4457	* Create a symlink.
4458	*
4459	* If the inode has been found through an idmapped mount the idmap of
4460	* the vfsmount must be passed through @idmap. This function will then take
4461	* care to map the inode according to @idmap before checking permissions.
4462	* On non-idmapped mounts or if permission checking is to be performed on the
4463	* raw inode simply pass @nop_mnt_idmap.
4464	*/
4465	int vfs_symlink(struct mnt_idmap idmap, struct* inode *dir,
4466	struct dentry dentry, const* char *oldname)
4467	{
4468	int error;
4469
4470	error = may_create(idmap, dir, child: dentry);
4471	if (error)
4472	return error;
4473
4474	if (!dir->i_op->symlink)
4475	return -EPERM;
4476
4477	error = security_inode_symlink(dir, dentry, old_name: oldname);
4478	if (error)
4479	return error;
4480
4481	error = dir->i_op->symlink(idmap, dir, dentry, oldname);
4482	if (!error)
4483	fsnotify_create(dir, dentry);
4484	return error;
4485	}
4486	EXPORT_SYMBOL(vfs_symlink);
4487
4488	int do_symlinkat(struct filename from, int* newdfd, struct filename *to)
4489	{
4490	int error;
4491	struct dentry *dentry;
4492	struct path path;
4493	unsigned int lookup_flags = `0`;
4494
4495	if (IS_ERR(ptr: from)) {
4496	error = PTR_ERR(ptr: from);
4497	goto out_putnames;
4498	}
4499	retry:
4500	dentry = filename_create(dfd: newdfd, name: to, path: &path, lookup_flags);
4501	error = PTR_ERR(ptr: dentry);
4502	if (IS_ERR(ptr: dentry))
4503	goto out_putnames;
4504
4505	error = security_path_symlink(dir: &path, dentry, old_name: from->name);
4506	if (!error)
4507	error = vfs_symlink(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
4508	dentry, from->name);
4509	done_path_create(&path, dentry);
4510	if (retry_estale(error, flags: lookup_flags)) {
4511	lookup_flags \|= LOOKUP_REVAL;
4512	goto retry;
4513	}
4514	out_putnames:
4515	putname(to);
4516	putname(from);
4517	return error;
4518	}
4519
4520	SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
4521	int, newdfd, const char __user *, newname)
4522	{
4523	return do_symlinkat(from: getname(filename: oldname), newdfd, to: getname(filename: newname));
4524	}
4525
4526	SYSCALL_DEFINE2(symlink, const char __user , oldname, const* char __user *, newname)
4527	{
4528	return do_symlinkat(from: getname(filename: oldname), AT_FDCWD, to: getname(filename: newname));
4529	}
4530
4531	/**
4532	* vfs_link - create a new link
4533	* @old_dentry: object to be linked
4534	* @idmap: idmap of the mount
4535	* @dir: new parent
4536	* @new_dentry: where to create the new link
4537	* @delegated_inode: returns inode needing a delegation break
4538	*
4539	* The caller must hold dir->i_mutex
4540	*
4541	* If vfs_link discovers a delegation on the to-be-linked file in need
4542	* of breaking, it will return -EWOULDBLOCK and return a reference to the
4543	* inode in delegated_inode. The caller should then break the delegation
4544	* and retry. Because breaking a delegation may take a long time, the
4545	* caller should drop the i_mutex before doing so.
4546	*
4547	* Alternatively, a caller may pass NULL for delegated_inode. This may
4548	* be appropriate for callers that expect the underlying filesystem not
4549	* to be NFS exported.
4550	*
4551	* If the inode has been found through an idmapped mount the idmap of
4552	* the vfsmount must be passed through @idmap. This function will then take
4553	* care to map the inode according to @idmap before checking permissions.
4554	* On non-idmapped mounts or if permission checking is to be performed on the
4555	* raw inode simply pass @nop_mnt_idmap.
4556	*/
4557	int vfs_link(struct dentry old_dentry, struct* mnt_idmap *idmap,
4558	struct inode dir, struct* dentry *new_dentry,
4559	struct inode **delegated_inode)
4560	{
4561	struct inode *inode = old_dentry->d_inode;
4562	unsigned max_links = dir->i_sb->s_max_links;
4563	int error;
4564
4565	if (!inode)
4566	return -ENOENT;
4567
4568	error = may_create(idmap, dir, child: new_dentry);
4569	if (error)
4570	return error;
4571
4572	if (dir->i_sb != inode->i_sb)
4573	return -EXDEV;
4574
4575	/*
4576	* A link to an append-only or immutable file cannot be created.
4577	*/
4578	if (IS_APPEND(inode) \|\| IS_IMMUTABLE(inode))
4579	return -EPERM;
4580	/*
4581	* Updating the link count will likely cause i_uid and i_gid to
4582	* be writen back improperly if their true value is unknown to
4583	* the vfs.
4584	*/
4585	if (HAS_UNMAPPED_ID(idmap, inode))
4586	return -EPERM;
4587	if (!dir->i_op->link)
4588	return -EPERM;
4589	if (S_ISDIR(inode->i_mode))
4590	return -EPERM;
4591
4592	error = security_inode_link(old_dentry, dir, new_dentry);
4593	if (error)
4594	return error;
4595
4596	inode_lock(inode);
4597	/ Make sure we don't allow creating hardlink to an unlinked file /
4598	if (inode->i_nlink == `0` && !(inode->i_state & I_LINKABLE))
4599	error = -ENOENT;
4600	else if (max_links && inode->i_nlink >= max_links)
4601	error = -EMLINK;
4602	else {
4603	error = try_break_deleg(inode, delegated_inode);
4604	if (!error)
4605	error = dir->i_op->link(old_dentry, dir, new_dentry);
4606	}
4607
4608	if (!error && (inode->i_state & I_LINKABLE)) {
4609	spin_lock(lock: &inode->i_lock);
4610	inode->i_state &= ~I_LINKABLE;
4611	spin_unlock(lock: &inode->i_lock);
4612	}
4613	inode_unlock(inode);
4614	if (!error)
4615	fsnotify_link(dir, inode, new_dentry);
4616	return error;
4617	}
4618	EXPORT_SYMBOL(vfs_link);
4619
4620	/*
4621	* Hardlinks are often used in delicate situations. We avoid
4622	* security-related surprises by not following symlinks on the
4623	* newname. --KAB
4624	*
4625	* We don't follow them on the oldname either to be compatible
4626	* with linux 2.0, and to avoid hard-linking to directories
4627	* and other special files. --ADM
4628	*/
4629	int do_linkat(int olddfd, struct filename old, int* newdfd,
4630	struct filename new, int* flags)
4631	{
4632	struct mnt_idmap *idmap;
4633	struct dentry *new_dentry;
4634	struct path old_path, new_path;
4635	struct inode *delegated_inode = NULL;
4636	int how = `0`;
4637	int error;
4638
4639	if ((flags & ~(AT_SYMLINK_FOLLOW \| AT_EMPTY_PATH)) != `0`) {
4640	error = -EINVAL;
4641	goto out_putnames;
4642	}
4643	/*
4644	* To use null names we require CAP_DAC_READ_SEARCH
4645	* This ensures that not everyone will be able to create
4646	* handlink using the passed filedescriptor.
4647	*/
4648	if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) {
4649	error = -ENOENT;
4650	goto out_putnames;
4651	}
4652
4653	if (flags & AT_SYMLINK_FOLLOW)
4654	how \|= LOOKUP_FOLLOW;
4655	retry:
4656	error = filename_lookup(dfd: olddfd, name: old, flags: how, path: &old_path, NULL);
4657	if (error)
4658	goto out_putnames;
4659
4660	new_dentry = filename_create(dfd: newdfd, name: new, path: &new_path,
4661	lookup_flags: (how & LOOKUP_REVAL));
4662	error = PTR_ERR(ptr: new_dentry);
4663	if (IS_ERR(ptr: new_dentry))
4664	goto out_putpath;
4665
4666	error = -EXDEV;
4667	if (old_path.mnt != new_path.mnt)
4668	goto out_dput;
4669	idmap = mnt_idmap(mnt: new_path.mnt);
4670	error = may_linkat(idmap, link: &old_path);
4671	if (unlikely(error))
4672	goto out_dput;
4673	error = security_path_link(old_dentry: old_path.dentry, new_dir: &new_path, new_dentry);
4674	if (error)
4675	goto out_dput;
4676	error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
4677	new_dentry, &delegated_inode);
4678	out_dput:
4679	done_path_create(&new_path, new_dentry);
4680	if (delegated_inode) {
4681	error = break_deleg_wait(delegated_inode: &delegated_inode);
4682	if (!error) {
4683	path_put(&old_path);
4684	goto retry;
4685	}
4686	}
4687	if (retry_estale(error, flags: how)) {
4688	path_put(&old_path);
4689	how \|= LOOKUP_REVAL;
4690	goto retry;
4691	}
4692	out_putpath:
4693	path_put(&old_path);
4694	out_putnames:
4695	putname(old);
4696	putname(new);
4697
4698	return error;
4699	}
4700
4701	SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
4702	int, newdfd, const char __user , newname, int*, flags)
4703	{
4704	return do_linkat(olddfd, old: getname_uflags(filename: oldname, uflags: flags),
4705	newdfd, new: getname(filename: newname), flags);
4706	}
4707
4708	SYSCALL_DEFINE2(link, const char __user , oldname, const* char __user *, newname)
4709	{
4710	return do_linkat(AT_FDCWD, old: getname(filename: oldname), AT_FDCWD, new: getname(filename: newname), flags: `0`);
4711	}
4712
4713	/**
4714	* vfs_rename - rename a filesystem object
4715	* @rd: pointer to &struct renamedata info
4716	*
4717	* The caller must hold multiple mutexes--see lock_rename()).
4718	*
4719	* If vfs_rename discovers a delegation in need of breaking at either
4720	* the source or destination, it will return -EWOULDBLOCK and return a
4721	* reference to the inode in delegated_inode. The caller should then
4722	* break the delegation and retry. Because breaking a delegation may
4723	* take a long time, the caller should drop all locks before doing
4724	* so.
4725	*
4726	* Alternatively, a caller may pass NULL for delegated_inode. This may
4727	* be appropriate for callers that expect the underlying filesystem not
4728	* to be NFS exported.
4729	*
4730	* The worst of all namespace operations - renaming directory. "Perverted"
4731	* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
4732	* Problems:
4733	*
4734	* a) we can get into loop creation.
4735	* b) race potential - two innocent renames can create a loop together.
4736	* That's where 4.4BSD screws up. Current fix: serialization on
4737	* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
4738	* story.
4739	* c) we may have to lock up to _four_ objects - parents and victim (if it exists),
4740	* and source (if it's a non-directory or a subdirectory that moves to
4741	* different parent).
4742	* And that - after we got ->i_mutex on parents (until then we don't know
4743	* whether the target exists). Solution: try to be smart with locking
4744	* order for inodes. We rely on the fact that tree topology may change
4745	* only under ->s_vfs_rename_mutex _and_ that parent of the object we
4746	* move will be locked. Thus we can rank directories by the tree
4747	* (ancestors first) and rank all non-directories after them.
4748	* That works since everybody except rename does "lock parent, lookup,
4749	* lock child" and rename is under ->s_vfs_rename_mutex.
4750	* HOWEVER, it relies on the assumption that any object with ->lookup()
4751	* has no more than 1 dentry. If "hybrid" objects will ever appear,
4752	* we'd better make sure that there's no link(2) for them.
4753	* d) conversion from fhandle to dentry may come in the wrong moment - when
4754	* we are removing the target. Solution: we will have to grab ->i_mutex
4755	* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4756	* ->i_mutex on parents, which works but leads to some truly excessive
4757	* locking].
4758	*/
4759	int vfs_rename(struct renamedata *rd)
4760	{
4761	int error;
4762	struct inode old_dir = rd->old_dir, new_dir = rd->new_dir;
4763	struct dentry *old_dentry = rd->old_dentry;
4764	struct dentry *new_dentry = rd->new_dentry;
4765	struct inode **delegated_inode = rd->delegated_inode;
4766	unsigned int flags = rd->flags;
4767	bool is_dir = d_is_dir(dentry: old_dentry);
4768	struct inode *source = old_dentry->d_inode;
4769	struct inode *target = new_dentry->d_inode;
4770	bool new_is_dir = false;
4771	unsigned max_links = new_dir->i_sb->s_max_links;
4772	struct name_snapshot old_name;
4773	bool lock_old_subdir, lock_new_subdir;
4774
4775	if (source == target)
4776	return `0`;
4777
4778	error = may_delete(idmap: rd->old_mnt_idmap, dir: old_dir, victim: old_dentry, isdir: is_dir);
4779	if (error)
4780	return error;
4781
4782	if (!target) {
4783	error = may_create(idmap: rd->new_mnt_idmap, dir: new_dir, child: new_dentry);
4784	} else {
4785	new_is_dir = d_is_dir(dentry: new_dentry);
4786
4787	if (!(flags & RENAME_EXCHANGE))
4788	error = may_delete(idmap: rd->new_mnt_idmap, dir: new_dir,
4789	victim: new_dentry, isdir: is_dir);
4790	else
4791	error = may_delete(idmap: rd->new_mnt_idmap, dir: new_dir,
4792	victim: new_dentry, isdir: new_is_dir);
4793	}
4794	if (error)
4795	return error;
4796
4797	if (!old_dir->i_op->rename)
4798	return -EPERM;
4799
4800	/*
4801	* If we are going to change the parent - check write permissions,
4802	* we'll need to flip '..'.
4803	*/
4804	if (new_dir != old_dir) {
4805	if (is_dir) {
4806	error = inode_permission(rd->old_mnt_idmap, source,
4807	MAY_WRITE);
4808	if (error)
4809	return error;
4810	}
4811	if ((flags & RENAME_EXCHANGE) && new_is_dir) {
4812	error = inode_permission(rd->new_mnt_idmap, target,
4813	MAY_WRITE);
4814	if (error)
4815	return error;
4816	}
4817	}
4818
4819	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
4820	flags);
4821	if (error)
4822	return error;
4823
4824	take_dentry_name_snapshot(&old_name, old_dentry);
4825	dget(dentry: new_dentry);
4826	/*
4827	* Lock children.
4828	* The source subdirectory needs to be locked on cross-directory
4829	* rename or cross-directory exchange since its parent changes.
4830	* The target subdirectory needs to be locked on cross-directory
4831	* exchange due to parent change and on any rename due to becoming
4832	* a victim.
4833	* Non-directories need locking in all cases (for NFS reasons);
4834	* they get locked after any subdirectories (in inode address order).
4835	*
4836	* NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
4837	* NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
4838	*/
4839	lock_old_subdir = new_dir != old_dir;
4840	lock_new_subdir = new_dir != old_dir \|\| !(flags & RENAME_EXCHANGE);
4841	if (is_dir) {
4842	if (lock_old_subdir)
4843	inode_lock_nested(inode: source, subclass: I_MUTEX_CHILD);
4844	if (target && (!new_is_dir \|\| lock_new_subdir))
4845	inode_lock(inode: target);
4846	} else if (new_is_dir) {
4847	if (lock_new_subdir)
4848	inode_lock_nested(inode: target, subclass: I_MUTEX_CHILD);
4849	inode_lock(inode: source);
4850	} else {
4851	lock_two_nondirectories(source, target);
4852	}
4853
4854	error = -EPERM;
4855	if (IS_SWAPFILE(source) \|\| (target && IS_SWAPFILE(target)))
4856	goto out;
4857
4858	error = -EBUSY;
4859	if (is_local_mountpoint(dentry: old_dentry) \|\| is_local_mountpoint(dentry: new_dentry))
4860	goto out;
4861
4862	if (max_links && new_dir != old_dir) {
4863	error = -EMLINK;
4864	if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4865	goto out;
4866	if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
4867	old_dir->i_nlink >= max_links)
4868	goto out;
4869	}
4870	if (!is_dir) {
4871	error = try_break_deleg(inode: source, delegated_inode);
4872	if (error)
4873	goto out;
4874	}
4875	if (target && !new_is_dir) {
4876	error = try_break_deleg(inode: target, delegated_inode);
4877	if (error)
4878	goto out;
4879	}
4880	error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry,
4881	new_dir, new_dentry, flags);
4882	if (error)
4883	goto out;
4884
4885	if (!(flags & RENAME_EXCHANGE) && target) {
4886	if (is_dir) {
4887	shrink_dcache_parent(new_dentry);
4888	target->i_flags \|= S_DEAD;
4889	}
4890	dont_mount(dentry: new_dentry);
4891	detach_mounts(dentry: new_dentry);
4892	}
4893	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
4894	if (!(flags & RENAME_EXCHANGE))
4895	d_move(old_dentry, new_dentry);
4896	else
4897	d_exchange(old_dentry, new_dentry);
4898	}
4899	out:
4900	if (!is_dir \|\| lock_old_subdir)
4901	inode_unlock(inode: source);
4902	if (target && (!new_is_dir \|\| lock_new_subdir))
4903	inode_unlock(inode: target);
4904	dput(new_dentry);
4905	if (!error) {
4906	fsnotify_move(old_dir, new_dir, old_name: &old_name.name, isdir: is_dir,
4907	target: !(flags & RENAME_EXCHANGE) ? target : NULL, moved: old_dentry);
4908	if (flags & RENAME_EXCHANGE) {
4909	fsnotify_move(old_dir: new_dir, new_dir: old_dir, old_name: &old_dentry->d_name,
4910	isdir: new_is_dir, NULL, moved: new_dentry);
4911	}
4912	}
4913	release_dentry_name_snapshot(&old_name);
4914
4915	return error;
4916	}
4917	EXPORT_SYMBOL(vfs_rename);
4918
4919	int do_renameat2(int olddfd, struct filename from, int* newdfd,
4920	struct filename to, unsigned* int flags)
4921	{
4922	struct renamedata rd;
4923	struct dentry old_dentry, new_dentry;
4924	struct dentry *trap;
4925	struct path old_path, new_path;
4926	struct qstr old_last, new_last;
4927	int old_type, new_type;
4928	struct inode *delegated_inode = NULL;
4929	unsigned int lookup_flags = `0`, target_flags = LOOKUP_RENAME_TARGET;
4930	bool should_retry = false;
4931	int error = -EINVAL;
4932
4933	if (flags & ~(RENAME_NOREPLACE \| RENAME_EXCHANGE \| RENAME_WHITEOUT))
4934	goto put_names;
4935
4936	if ((flags & (RENAME_NOREPLACE \| RENAME_WHITEOUT)) &&
4937	(flags & RENAME_EXCHANGE))
4938	goto put_names;
4939
4940	if (flags & RENAME_EXCHANGE)
4941	target_flags = `0`;
4942
4943	retry:
4944	error = filename_parentat(dfd: olddfd, name: from, flags: lookup_flags, parent: &old_path,
4945	last: &old_last, type: &old_type);
4946	if (error)
4947	goto put_names;
4948
4949	error = filename_parentat(dfd: newdfd, name: to, flags: lookup_flags, parent: &new_path, last: &new_last,
4950	type: &new_type);
4951	if (error)
4952	goto exit1;
4953
4954	error = -EXDEV;
4955	if (old_path.mnt != new_path.mnt)
4956	goto exit2;
4957
4958	error = -EBUSY;
4959	if (old_type != LAST_NORM)
4960	goto exit2;
4961
4962	if (flags & RENAME_NOREPLACE)
4963	error = -EEXIST;
4964	if (new_type != LAST_NORM)
4965	goto exit2;
4966
4967	error = mnt_want_write(mnt: old_path.mnt);
4968	if (error)
4969	goto exit2;
4970
4971	retry_deleg:
4972	trap = lock_rename(new_path.dentry, old_path.dentry);
4973	if (IS_ERR(ptr: trap)) {
4974	error = PTR_ERR(ptr: trap);
4975	goto exit_lock_rename;
4976	}
4977
4978	old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
4979	lookup_flags);
4980	error = PTR_ERR(ptr: old_dentry);
4981	if (IS_ERR(ptr: old_dentry))
4982	goto exit3;
4983	/ source must exist /
4984	error = -ENOENT;
4985	if (d_is_negative(dentry: old_dentry))
4986	goto exit4;
4987	new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
4988	lookup_flags \| target_flags);
4989	error = PTR_ERR(ptr: new_dentry);
4990	if (IS_ERR(ptr: new_dentry))
4991	goto exit4;
4992	error = -EEXIST;
4993	if ((flags & RENAME_NOREPLACE) && d_is_positive(dentry: new_dentry))
4994	goto exit5;
4995	if (flags & RENAME_EXCHANGE) {
4996	error = -ENOENT;
4997	if (d_is_negative(dentry: new_dentry))
4998	goto exit5;
4999
5000	if (!d_is_dir(dentry: new_dentry)) {
5001	error = -ENOTDIR;
5002	if (new_last.name[new_last.len])
5003	goto exit5;
5004	}
5005	}
5006	/ unless the source is a directory trailing slashes give -ENOTDIR /
5007	if (!d_is_dir(dentry: old_dentry)) {
5008	error = -ENOTDIR;
5009	if (old_last.name[old_last.len])
5010	goto exit5;
5011	if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
5012	goto exit5;
5013	}
5014	/ source should not be ancestor of target /
5015	error = -EINVAL;
5016	if (old_dentry == trap)
5017	goto exit5;
5018	/ target should not be an ancestor of source /
5019	if (!(flags & RENAME_EXCHANGE))
5020	error = -ENOTEMPTY;
5021	if (new_dentry == trap)
5022	goto exit5;
5023
5024	error = security_path_rename(old_dir: &old_path, old_dentry,
5025	new_dir: &new_path, new_dentry, flags);
5026	if (error)
5027	goto exit5;
5028
5029	rd.old_dir = old_path.dentry->d_inode;
5030	rd.old_dentry = old_dentry;
5031	rd.old_mnt_idmap = mnt_idmap(mnt: old_path.mnt);
5032	rd.new_dir = new_path.dentry->d_inode;
5033	rd.new_dentry = new_dentry;
5034	rd.new_mnt_idmap = mnt_idmap(mnt: new_path.mnt);
5035	rd.delegated_inode = &delegated_inode;
5036	rd.flags = flags;
5037	error = vfs_rename(&rd);
5038	exit5:
5039	dput(new_dentry);
5040	exit4:
5041	dput(old_dentry);
5042	exit3:
5043	unlock_rename(new_path.dentry, old_path.dentry);
5044	exit_lock_rename:
5045	if (delegated_inode) {
5046	error = break_deleg_wait(delegated_inode: &delegated_inode);
5047	if (!error)
5048	goto retry_deleg;
5049	}
5050	mnt_drop_write(mnt: old_path.mnt);
5051	exit2:
5052	if (retry_estale(error, flags: lookup_flags))
5053	should_retry = true;
5054	path_put(&new_path);
5055	exit1:
5056	path_put(&old_path);
5057	if (should_retry) {
5058	should_retry = false;
5059	lookup_flags \|= LOOKUP_REVAL;
5060	goto retry;
5061	}
5062	put_names:
5063	putname(from);
5064	putname(to);
5065	return error;
5066	}
5067
5068	SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
5069	int, newdfd, const char __user , newname, unsigned* int, flags)
5070	{
5071	return do_renameat2(olddfd, from: getname(filename: oldname), newdfd, to: getname(filename: newname),
5072	flags);
5073	}
5074
5075	SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
5076	int, newdfd, const char __user *, newname)
5077	{
5078	return do_renameat2(olddfd, from: getname(filename: oldname), newdfd, to: getname(filename: newname),
5079	flags: `0`);
5080	}
5081
5082	SYSCALL_DEFINE2(rename, const char __user , oldname, const* char __user *, newname)
5083	{
5084	return do_renameat2(AT_FDCWD, from: getname(filename: oldname), AT_FDCWD,
5085	to: getname(filename: newname), flags: `0`);
5086	}
5087
5088	int readlink_copy(char __user buffer, int* buflen, const char *link)
5089	{
5090	int len = PTR_ERR(ptr: link);
5091	if (IS_ERR(ptr: link))
5092	goto out;
5093
5094	len = strlen(link);
5095	if (len > (unsigned) buflen)
5096	len = buflen;
5097	if (copy_to_user(to: buffer, from: link, n: len))
5098	len = -EFAULT;
5099	out:
5100	return len;
5101	}
5102
5103	/**
5104	* vfs_readlink - copy symlink body into userspace buffer
5105	* @dentry: dentry on which to get symbolic link
5106	* @buffer: user memory pointer
5107	* @buflen: size of buffer
5108	*
5109	* Does not touch atime. That's up to the caller if necessary
5110	*
5111	* Does not call security hook.
5112	*/
5113	int vfs_readlink(struct dentry dentry, char* __user buffer, int* buflen)
5114	{
5115	struct inode *inode = d_inode(dentry);
5116	DEFINE_DELAYED_CALL(done);
5117	const char *link;
5118	int res;
5119
5120	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
5121	if (unlikely(inode->i_op->readlink))
5122	return inode->i_op->readlink(dentry, buffer, buflen);
5123
5124	if (!d_is_symlink(dentry))
5125	return -EINVAL;
5126
5127	spin_lock(lock: &inode->i_lock);
5128	inode->i_opflags \|= IOP_DEFAULT_READLINK;
5129	spin_unlock(lock: &inode->i_lock);
5130	}
5131
5132	link = READ_ONCE(inode->i_link);
5133	if (!link) {
5134	link = inode->i_op->get_link(dentry, inode, &done);
5135	if (IS_ERR(ptr: link))
5136	return PTR_ERR(ptr: link);
5137	}
5138	res = readlink_copy(buffer, buflen, link);
5139	do_delayed_call(call: &done);
5140	return res;
5141	}
5142	EXPORT_SYMBOL(vfs_readlink);
5143
5144	/**
5145	* vfs_get_link - get symlink body
5146	* @dentry: dentry on which to get symbolic link
5147	* @done: caller needs to free returned data with this
5148	*
5149	* Calls security hook and i_op->get_link() on the supplied inode.
5150	*
5151	* It does not touch atime. That's up to the caller if necessary.
5152	*
5153	* Does not work on "special" symlinks like /proc/$$/fd/N
5154	*/
5155	const char vfs_get_link(struct* dentry dentry, struct* delayed_call *done)
5156	{
5157	const char *res = ERR_PTR(error: -EINVAL);
5158	struct inode *inode = d_inode(dentry);
5159
5160	if (d_is_symlink(dentry)) {
5161	res = ERR_PTR(error: security_inode_readlink(dentry));
5162	if (!res)
5163	res = inode->i_op->get_link(dentry, inode, done);
5164	}
5165	return res;
5166	}
5167	EXPORT_SYMBOL(vfs_get_link);
5168
5169	/ get the link contents into pagecache /
5170	const char page_get_link(struct* dentry dentry, struct* inode *inode,
5171	struct delayed_call *callback)
5172	{
5173	char *kaddr;
5174	struct page *page;
5175	struct address_space *mapping = inode->i_mapping;
5176
5177	if (!dentry) {
5178	page = find_get_page(mapping, offset: `0`);
5179	if (!page)
5180	return ERR_PTR(error: -ECHILD);
5181	if (!PageUptodate(page)) {
5182	put_page(page);
5183	return ERR_PTR(error: -ECHILD);
5184	}
5185	} else {
5186	page = read_mapping_page(mapping, index: `0`, NULL);
5187	if (IS_ERR(ptr: page))
5188	return (char*)page;
5189	}
5190	set_delayed_call(call: callback, fn: page_put_link, arg: page);
5191	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
5192	kaddr = page_address(page);
5193	nd_terminate_link(name: kaddr, len: inode->i_size, PAGE_SIZE - `1`);
5194	return kaddr;
5195	}
5196
5197	EXPORT_SYMBOL(page_get_link);
5198
/*
 * page_put_link - delayed-call helper that drops a page reference
 * @arg: the page to release, passed as an opaque pointer
 *
 * page_get_link() registers this function through set_delayed_call().
 */
void page_put_link(void *arg)
{
	put_page(arg);
}
EXPORT_SYMBOL(page_put_link);
5204
5205	int page_readlink(struct dentry dentry, char* __user buffer, int* buflen)
5206	{
5207	DEFINE_DELAYED_CALL(done);
5208	int res = readlink_copy(buffer, buflen,
5209	link: page_get_link(dentry, d_inode(dentry),
5210	&done));
5211	do_delayed_call(call: &done);
5212	return res;
5213	}
5214	EXPORT_SYMBOL(page_readlink);
5215
5216	int page_symlink(struct inode inode, const* char symname, int* len)
5217	{
5218	struct address_space *mapping = inode->i_mapping;
5219	const struct address_space_operations *aops = mapping->a_ops;
5220	bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
5221	struct page *page;
5222	void *fsdata = NULL;
5223	int err;
5224	unsigned int flags;
5225
5226	retry:
5227	if (nofs)
5228	flags = memalloc_nofs_save();
5229	err = aops->write_begin(NULL, mapping, `0`, len-`1`, &page, &fsdata);
5230	if (nofs)
5231	memalloc_nofs_restore(flags);
5232	if (err)
5233	goto fail;
5234
5235	memcpy(page_address(page), symname, len-`1`);
5236
5237	err = aops->write_end(NULL, mapping, `0`, len-`1`, len-`1`,
5238	page, fsdata);
5239	if (err < `0`)
5240	goto fail;
5241	if (err < len-`1`)
5242	goto retry;
5243
5244	mark_inode_dirty(inode);
5245	return `0`;
5246	fail:
5247	return err;
5248	}
5249	EXPORT_SYMBOL(page_symlink);
5250
5251	const struct inode_operations page_symlink_inode_operations = {
5252	.get_link = page_get_link,
5253	};
5254	EXPORT_SYMBOL(page_symlink_inode_operations);
5255

source code of linux/fs/namei.c