1 | /* |
2 | * memfd_create system call and file sealing support |
3 | * |
4 | * Code was originally included in shmem.c, and broken out to facilitate |
5 | * use by hugetlbfs as well as tmpfs. |
6 | * |
7 | * This file is released under the GPL. |
8 | */ |
9 | |
10 | #include <linux/fs.h> |
11 | #include <linux/vfs.h> |
12 | #include <linux/pagemap.h> |
13 | #include <linux/file.h> |
14 | #include <linux/mm.h> |
15 | #include <linux/sched/signal.h> |
16 | #include <linux/khugepaged.h> |
17 | #include <linux/syscalls.h> |
18 | #include <linux/hugetlb.h> |
19 | #include <linux/shmem_fs.h> |
20 | #include <linux/memfd.h> |
21 | #include <uapi/linux/memfd.h> |
22 | |
/*
 * We need a tag: a new tag would expand every xa_node by 8 bytes,
 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
 * or hugetlbfs because they are memory only filesystems.
 */
#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
/*
 * Index of the final pass in memfd_wait_for_pins(). Passes 1..LAST_SCAN each
 * sleep for (HZ << scan) / 200 jiffies, which is where the total below comes
 * from.
 */
#define LAST_SCAN 4 /* about 150ms max */
30 | |
31 | static void memfd_tag_pins(struct xa_state *xas) |
32 | { |
33 | struct page *page; |
34 | unsigned int tagged = 0; |
35 | |
36 | lru_add_drain(); |
37 | |
38 | xas_lock_irq(xas); |
39 | xas_for_each(xas, page, ULONG_MAX) { |
40 | if (xa_is_value(page)) |
41 | continue; |
42 | if (page_count(page) - page_mapcount(page) > 1) |
43 | xas_set_mark(xas, MEMFD_TAG_PINNED); |
44 | |
45 | if (++tagged % XA_CHECK_SCHED) |
46 | continue; |
47 | |
48 | xas_pause(xas); |
49 | xas_unlock_irq(xas); |
50 | cond_resched(); |
51 | xas_lock_irq(xas); |
52 | } |
53 | xas_unlock_irq(xas); |
54 | } |
55 | |
56 | /* |
57 | * Setting SEAL_WRITE requires us to verify there's no pending writer. However, |
58 | * via get_user_pages(), drivers might have some pending I/O without any active |
59 | * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages |
60 | * and see whether it has an elevated ref-count. If so, we tag them and wait for |
61 | * them to be dropped. |
62 | * The caller must guarantee that no new user will acquire writable references |
63 | * to those pages to avoid races. |
64 | */ |
65 | static int memfd_wait_for_pins(struct address_space *mapping) |
66 | { |
67 | XA_STATE(xas, &mapping->i_pages, 0); |
68 | struct page *page; |
69 | int error, scan; |
70 | |
71 | memfd_tag_pins(&xas); |
72 | |
73 | error = 0; |
74 | for (scan = 0; scan <= LAST_SCAN; scan++) { |
75 | unsigned int tagged = 0; |
76 | |
77 | if (!xas_marked(&xas, MEMFD_TAG_PINNED)) |
78 | break; |
79 | |
80 | if (!scan) |
81 | lru_add_drain_all(); |
82 | else if (schedule_timeout_killable((HZ << scan) / 200)) |
83 | scan = LAST_SCAN; |
84 | |
85 | xas_set(&xas, 0); |
86 | xas_lock_irq(&xas); |
87 | xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) { |
88 | bool clear = true; |
89 | if (xa_is_value(page)) |
90 | continue; |
91 | if (page_count(page) - page_mapcount(page) != 1) { |
92 | /* |
93 | * On the last scan, we clean up all those tags |
94 | * we inserted; but make a note that we still |
95 | * found pages pinned. |
96 | */ |
97 | if (scan == LAST_SCAN) |
98 | error = -EBUSY; |
99 | else |
100 | clear = false; |
101 | } |
102 | if (clear) |
103 | xas_clear_mark(&xas, MEMFD_TAG_PINNED); |
104 | if (++tagged % XA_CHECK_SCHED) |
105 | continue; |
106 | |
107 | xas_pause(&xas); |
108 | xas_unlock_irq(&xas); |
109 | cond_resched(); |
110 | xas_lock_irq(&xas); |
111 | } |
112 | xas_unlock_irq(&xas); |
113 | } |
114 | |
115 | return error; |
116 | } |
117 | |
118 | static unsigned int *memfd_file_seals_ptr(struct file *file) |
119 | { |
120 | if (shmem_file(file)) |
121 | return &SHMEM_I(file_inode(file))->seals; |
122 | |
123 | #ifdef CONFIG_HUGETLBFS |
124 | if (is_file_hugepages(file)) |
125 | return &HUGETLBFS_I(file_inode(file))->seals; |
126 | #endif |
127 | |
128 | return NULL; |
129 | } |
130 | |
/* Every seal bit memfd_add_seals() accepts; any other bit yields -EINVAL. */
#define F_ALL_SEALS (F_SEAL_SEAL | \
		     F_SEAL_SHRINK | \
		     F_SEAL_GROW | \
		     F_SEAL_WRITE | \
		     F_SEAL_FUTURE_WRITE)
136 | |
137 | static int memfd_add_seals(struct file *file, unsigned int seals) |
138 | { |
139 | struct inode *inode = file_inode(file); |
140 | unsigned int *file_seals; |
141 | int error; |
142 | |
143 | /* |
144 | * SEALING |
145 | * Sealing allows multiple parties to share a tmpfs or hugetlbfs file |
146 | * but restrict access to a specific subset of file operations. Seals |
147 | * can only be added, but never removed. This way, mutually untrusted |
148 | * parties can share common memory regions with a well-defined policy. |
149 | * A malicious peer can thus never perform unwanted operations on a |
150 | * shared object. |
151 | * |
152 | * Seals are only supported on special tmpfs or hugetlbfs files and |
153 | * always affect the whole underlying inode. Once a seal is set, it |
154 | * may prevent some kinds of access to the file. Currently, the |
155 | * following seals are defined: |
156 | * SEAL_SEAL: Prevent further seals from being set on this file |
157 | * SEAL_SHRINK: Prevent the file from shrinking |
158 | * SEAL_GROW: Prevent the file from growing |
159 | * SEAL_WRITE: Prevent write access to the file |
160 | * |
161 | * As we don't require any trust relationship between two parties, we |
162 | * must prevent seals from being removed. Therefore, sealing a file |
163 | * only adds a given set of seals to the file, it never touches |
164 | * existing seals. Furthermore, the "setting seals"-operation can be |
165 | * sealed itself, which basically prevents any further seal from being |
166 | * added. |
167 | * |
168 | * Semantics of sealing are only defined on volatile files. Only |
169 | * anonymous tmpfs and hugetlbfs files support sealing. More |
170 | * importantly, seals are never written to disk. Therefore, there's |
171 | * no plan to support it on other file types. |
172 | */ |
173 | |
174 | if (!(file->f_mode & FMODE_WRITE)) |
175 | return -EPERM; |
176 | if (seals & ~(unsigned int)F_ALL_SEALS) |
177 | return -EINVAL; |
178 | |
179 | inode_lock(inode); |
180 | |
181 | file_seals = memfd_file_seals_ptr(file); |
182 | if (!file_seals) { |
183 | error = -EINVAL; |
184 | goto unlock; |
185 | } |
186 | |
187 | if (*file_seals & F_SEAL_SEAL) { |
188 | error = -EPERM; |
189 | goto unlock; |
190 | } |
191 | |
192 | if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { |
193 | error = mapping_deny_writable(file->f_mapping); |
194 | if (error) |
195 | goto unlock; |
196 | |
197 | error = memfd_wait_for_pins(file->f_mapping); |
198 | if (error) { |
199 | mapping_allow_writable(file->f_mapping); |
200 | goto unlock; |
201 | } |
202 | } |
203 | |
204 | *file_seals |= seals; |
205 | error = 0; |
206 | |
207 | unlock: |
208 | inode_unlock(inode); |
209 | return error; |
210 | } |
211 | |
/*
 * Report the seals currently set on @file.
 *
 * Returns the seal bits, or -EINVAL if the file has no seal storage (see
 * memfd_file_seals_ptr()).
 */
static int memfd_get_seals(struct file *file)
{
	unsigned int *cur_seals = memfd_file_seals_ptr(file);

	if (!cur_seals)
		return -EINVAL;

	return *cur_seals;
}
218 | |
219 | long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) |
220 | { |
221 | long error; |
222 | |
223 | switch (cmd) { |
224 | case F_ADD_SEALS: |
225 | /* disallow upper 32bit */ |
226 | if (arg > UINT_MAX) |
227 | return -EINVAL; |
228 | |
229 | error = memfd_add_seals(file, arg); |
230 | break; |
231 | case F_GET_SEALS: |
232 | error = memfd_get_seals(file); |
233 | break; |
234 | default: |
235 | error = -EINVAL; |
236 | break; |
237 | } |
238 | |
239 | return error; |
240 | } |
241 | |
/* Prefix that memfd_create() puts in front of the user-supplied name. */
#define MFD_NAME_PREFIX "memfd:"
/* Length of the prefix, not counting its terminating NUL. */
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
/* Longest user-supplied name (without NUL) that keeps prefix + name within NAME_MAX. */
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

/* Flag bits memfd_create() accepts; huge page size bits are allowed on top with MFD_HUGETLB. */
#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
247 | |
248 | SYSCALL_DEFINE2(memfd_create, |
249 | const char __user *, uname, |
250 | unsigned int, flags) |
251 | { |
252 | unsigned int *file_seals; |
253 | struct file *file; |
254 | int fd, error; |
255 | char *name; |
256 | long len; |
257 | |
258 | if (!(flags & MFD_HUGETLB)) { |
259 | if (flags & ~(unsigned int)MFD_ALL_FLAGS) |
260 | return -EINVAL; |
261 | } else { |
262 | /* Allow huge page size encoding in flags. */ |
263 | if (flags & ~(unsigned int)(MFD_ALL_FLAGS | |
264 | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) |
265 | return -EINVAL; |
266 | } |
267 | |
268 | /* length includes terminating zero */ |
269 | len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); |
270 | if (len <= 0) |
271 | return -EFAULT; |
272 | if (len > MFD_NAME_MAX_LEN + 1) |
273 | return -EINVAL; |
274 | |
275 | name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); |
276 | if (!name) |
277 | return -ENOMEM; |
278 | |
279 | strcpy(name, MFD_NAME_PREFIX); |
280 | if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { |
281 | error = -EFAULT; |
282 | goto err_name; |
283 | } |
284 | |
285 | /* terminating-zero may have changed after strnlen_user() returned */ |
286 | if (name[len + MFD_NAME_PREFIX_LEN - 1]) { |
287 | error = -EFAULT; |
288 | goto err_name; |
289 | } |
290 | |
291 | fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); |
292 | if (fd < 0) { |
293 | error = fd; |
294 | goto err_name; |
295 | } |
296 | |
297 | if (flags & MFD_HUGETLB) { |
298 | struct user_struct *user = NULL; |
299 | |
300 | file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, |
301 | HUGETLB_ANONHUGE_INODE, |
302 | (flags >> MFD_HUGE_SHIFT) & |
303 | MFD_HUGE_MASK); |
304 | } else |
305 | file = shmem_file_setup(name, 0, VM_NORESERVE); |
306 | if (IS_ERR(file)) { |
307 | error = PTR_ERR(file); |
308 | goto err_fd; |
309 | } |
310 | file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; |
311 | file->f_flags |= O_LARGEFILE; |
312 | |
313 | if (flags & MFD_ALLOW_SEALING) { |
314 | file_seals = memfd_file_seals_ptr(file); |
315 | *file_seals &= ~F_SEAL_SEAL; |
316 | } |
317 | |
318 | fd_install(fd, file); |
319 | kfree(name); |
320 | return fd; |
321 | |
322 | err_fd: |
323 | put_unused_fd(fd); |
324 | err_name: |
325 | kfree(name); |
326 | return error; |
327 | } |
328 | |