1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/anon_inodes.h> |
3 | #include <linux/file.h> |
4 | #include <linux/fs.h> |
5 | #include <linux/magic.h> |
6 | #include <linux/mount.h> |
7 | #include <linux/pid.h> |
8 | #include <linux/pidfs.h> |
9 | #include <linux/pid_namespace.h> |
10 | #include <linux/poll.h> |
11 | #include <linux/proc_fs.h> |
12 | #include <linux/proc_ns.h> |
13 | #include <linux/pseudo_fs.h> |
14 | #include <linux/seq_file.h> |
15 | #include <uapi/linux/pidfd.h> |
16 | |
17 | #include "internal.h" |
18 | |
#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - emit the fdinfo record for a pidfd
 * @m: seq_file of the proc fdinfo entry being read
 * @f: the pidfd whose target is reported
 *
 * Pid:
 * The pid the pidfd refers to, translated into the pid namespace of
 * the procfs instance the fdinfo file lives on. When the pid namespace
 * of the process is not a descendant of the procfs instance's pid
 * namespace, 0 is shown, much like getppid() for a process whose
 * parent is outside of its pid namespace. When the pid has no task
 * attached anymore, the initial value -1 is shown instead.
 *
 * NSpid:
 * Only emitted if pid namespaces are supported. The first entry always
 * equals the Pid field. If that value is positive, it is followed by
 * the pid's number in each deeper pid namespace level, down to the
 * level of the pid itself. For 0 or -1 no further entries follow.
 *
 * This differs from the Pid and NSpid fields in /proc/<pid>/status,
 * which are always shown relative to the pid namespace of the procfs
 * instance. The difference shows when a pidfd travels between pid
 * namespaces that have no ancestral relation:
 * - create two new pid namespaces ns1 and ns2 in the initial pid
 *   namespace (with new mount namespaces and procfs mounted in them)
 * - create a process with a pidfd in ns1
 * - send the pidfd from ns1 to ns2
 * - read /proc/self/fdinfo/<pidfd> in ns2: both Pid and NSpid have
 *   exactly one entry, which is 0
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct pid *target = pidfd_pid(f);
	struct pid_namespace *procfs_ns;
	pid_t shown = -1;

	if (likely(pid_has_task(target, PIDTYPE_PID))) {
		procfs_ns = proc_pid_ns(file_inode(m->file)->i_sb);
		shown = pid_nr_ns(target, procfs_ns);
	}

	seq_put_decimal_ll(m, "Pid:\t", shown);

#ifdef CONFIG_PID_NS
	seq_put_decimal_ll(m, "\nNSpid:\t", shown);
	if (shown > 0) {
		/*
		 * A positive value can only come from the branch above,
		 * so procfs_ns is set here. Its own level has just been
		 * printed; continue with the next deeper one.
		 */
		int level = procfs_ns->level;

		while (++level <= target->level)
			seq_put_decimal_ll(m, "\t", target->numbers[level].nr);
	}
#endif
	seq_putc(m, '\n');
}
#endif
85 | |
86 | /* |
87 | * Poll support for process exit notification. |
88 | */ |
89 | static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) |
90 | { |
91 | struct pid *pid = pidfd_pid(file); |
92 | bool thread = file->f_flags & PIDFD_THREAD; |
93 | struct task_struct *task; |
94 | __poll_t poll_flags = 0; |
95 | |
96 | poll_wait(filp: file, wait_address: &pid->wait_pidfd, p: pts); |
97 | /* |
98 | * Depending on PIDFD_THREAD, inform pollers when the thread |
99 | * or the whole thread-group exits. |
100 | */ |
101 | guard(rcu)(); |
102 | task = pid_task(pid, PIDTYPE_PID); |
103 | if (!task) |
104 | poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; |
105 | else if (task->exit_state && (thread || thread_group_empty(p: task))) |
106 | poll_flags = EPOLLIN | EPOLLRDNORM; |
107 | |
108 | return poll_flags; |
109 | } |
110 | |
111 | static const struct file_operations pidfs_file_operations = { |
112 | .poll = pidfd_poll, |
113 | #ifdef CONFIG_PROC_FS |
114 | .show_fdinfo = pidfd_show_fdinfo, |
115 | #endif |
116 | }; |
117 | |
118 | struct pid *pidfd_pid(const struct file *file) |
119 | { |
120 | if (file->f_op != &pidfs_file_operations) |
121 | return ERR_PTR(error: -EBADF); |
122 | return file_inode(f: file)->i_private; |
123 | } |
124 | |
125 | static struct vfsmount *pidfs_mnt __ro_after_init; |
126 | |
127 | #if BITS_PER_LONG == 32 |
128 | /* |
129 | * Provide a fallback mechanism for 32-bit systems so processes remain |
130 | * reliably comparable by inode number even on those systems. |
131 | */ |
132 | static DEFINE_IDA(pidfd_inum_ida); |
133 | |
134 | static int pidfs_inum(struct pid *pid, unsigned long *ino) |
135 | { |
136 | int ret; |
137 | |
138 | ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, |
139 | UINT_MAX, GFP_ATOMIC); |
140 | if (ret < 0) |
141 | return -ENOSPC; |
142 | |
143 | *ino = ret; |
144 | return 0; |
145 | } |
146 | |
147 | static inline void pidfs_free_inum(unsigned long ino) |
148 | { |
149 | if (ino > 0) |
150 | ida_free(&pidfd_inum_ida, ino); |
151 | } |
152 | #else |
153 | static inline int pidfs_inum(struct pid *pid, unsigned long *ino) |
154 | { |
155 | *ino = pid->ino; |
156 | return 0; |
157 | } |
158 | #define pidfs_free_inum(ino) ((void)(ino)) |
159 | #endif |
160 | |
161 | /* |
162 | * The vfs falls back to simple_setattr() if i_op->setattr() isn't |
163 | * implemented. Let's reject it completely until we have a clean |
164 | * permission concept for pidfds. |
165 | */ |
166 | static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, |
167 | struct iattr *attr) |
168 | { |
169 | return -EOPNOTSUPP; |
170 | } |
171 | |
172 | static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, |
173 | struct kstat *stat, u32 request_mask, |
174 | unsigned int query_flags) |
175 | { |
176 | struct inode *inode = d_inode(dentry: path->dentry); |
177 | |
178 | generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); |
179 | return 0; |
180 | } |
181 | |
182 | static const struct inode_operations pidfs_inode_operations = { |
183 | .getattr = pidfs_getattr, |
184 | .setattr = pidfs_setattr, |
185 | }; |
186 | |
187 | static void pidfs_evict_inode(struct inode *inode) |
188 | { |
189 | struct pid *pid = inode->i_private; |
190 | |
191 | clear_inode(inode); |
192 | put_pid(pid); |
193 | pidfs_free_inum(inode->i_ino); |
194 | } |
195 | |
196 | static const struct super_operations pidfs_sops = { |
197 | .drop_inode = generic_delete_inode, |
198 | .evict_inode = pidfs_evict_inode, |
199 | .statfs = simple_statfs, |
200 | }; |
201 | |
202 | static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) |
203 | { |
204 | struct inode *inode = d_inode(dentry); |
205 | struct pid *pid = inode->i_private; |
206 | |
207 | return dynamic_dname(buffer, buflen, "pidfd:[%llu]" , pid->ino); |
208 | } |
209 | |
210 | static const struct dentry_operations pidfs_dentry_operations = { |
211 | .d_delete = always_delete_dentry, |
212 | .d_dname = pidfs_dname, |
213 | .d_prune = stashed_dentry_prune, |
214 | }; |
215 | |
216 | static int pidfs_init_inode(struct inode *inode, void *data) |
217 | { |
218 | inode->i_private = data; |
219 | inode->i_flags |= S_PRIVATE; |
220 | inode->i_mode |= S_IRWXU; |
221 | inode->i_op = &pidfs_inode_operations; |
222 | inode->i_fop = &pidfs_file_operations; |
223 | /* |
224 | * Inode numbering for pidfs start at RESERVED_PIDS + 1. This |
225 | * avoids collisions with the root inode which is 1 for pseudo |
226 | * filesystems. |
227 | */ |
228 | return pidfs_inum(pid: data, ino: &inode->i_ino); |
229 | } |
230 | |
/* stashed_operations->put_data callback: @data is a pid; drop one reference. */
static void pidfs_put_data(void *data)
{
	put_pid(data);
}
236 | |
237 | static const struct stashed_operations pidfs_stashed_ops = { |
238 | .init_inode = pidfs_init_inode, |
239 | .put_data = pidfs_put_data, |
240 | }; |
241 | |
242 | static int pidfs_init_fs_context(struct fs_context *fc) |
243 | { |
244 | struct pseudo_fs_context *ctx; |
245 | |
246 | ctx = init_pseudo(fc, PID_FS_MAGIC); |
247 | if (!ctx) |
248 | return -ENOMEM; |
249 | |
250 | ctx->ops = &pidfs_sops; |
251 | ctx->dops = &pidfs_dentry_operations; |
252 | fc->s_fs_info = (void *)&pidfs_stashed_ops; |
253 | return 0; |
254 | } |
255 | |
256 | static struct file_system_type pidfs_type = { |
257 | .name = "pidfs" , |
258 | .init_fs_context = pidfs_init_fs_context, |
259 | .kill_sb = kill_anon_super, |
260 | }; |
261 | |
262 | struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) |
263 | { |
264 | |
265 | struct file *pidfd_file; |
266 | struct path path; |
267 | int ret; |
268 | |
269 | ret = path_from_stashed(stashed: &pid->stashed, mnt: pidfs_mnt, data: get_pid(pid), path: &path); |
270 | if (ret < 0) |
271 | return ERR_PTR(error: ret); |
272 | |
273 | pidfd_file = dentry_open(path: &path, flags, current_cred()); |
274 | path_put(&path); |
275 | return pidfd_file; |
276 | } |
277 | |
278 | void __init pidfs_init(void) |
279 | { |
280 | pidfs_mnt = kern_mount(&pidfs_type); |
281 | if (IS_ERR(ptr: pidfs_mnt)) |
282 | panic(fmt: "Failed to mount pidfs pseudo filesystem" ); |
283 | } |
284 | |