// SPDX-License-Identifier: GPL-2.0
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>

#include "kvm_mm.h"

struct kvm_gmem {
	struct kvm *kvm;
	struct xarray bindings;
	struct list_head entry;
};

static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	struct folio *folio;

	/* TODO: Support huge pages. */
	folio = filemap_grab_folio(inode->i_mapping, index);
	if (IS_ERR_OR_NULL(folio))
		return NULL;

	/*
	 * Use the up-to-date flag to track whether or not the memory has been
	 * zeroed before being handed off to the guest. There is no backing
	 * storage for the memory, so the folio will remain up-to-date until
	 * it's removed.
	 *
	 * TODO: Skip clearing pages when trusted firmware will do it when
	 * assigning memory to the guest.
	 */
	if (!folio_test_uptodate(folio)) {
		unsigned long nr_pages = folio_nr_pages(folio);
		unsigned long i;

		for (i = 0; i < nr_pages; i++)
			clear_highpage(folio_page(folio, i));

		folio_mark_uptodate(folio);
	}

	/*
	 * Ignore accessed, referenced, and dirty flags. The memory is
	 * unevictable and there is no storage to write back to.
	 */
	return folio;
}

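/*
 * Zap any SPTEs that map guest memory in [start, end) for every memslot
 * currently bound to this guest_memfd instance.  Callers hold the inode's
 * filemap invalidation lock so the bindings cannot change underneath us;
 * mmu_lock is taken (and kvm_mmu_invalidate_begin() called) only if at least
 * one bound memslot intersects the range.
 */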
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
				      pgoff_t end)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

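/*
 * Counterpart to kvm_gmem_invalidate_begin(): end the invalidation window via
 * kvm_mmu_invalidate_end(), but only if "begin" actually started one, i.e.
 * only if some memslot is bound within the range.  Bindings are stable across
 * the begin/end pair, so the two stay balanced.
 */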
static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
				    pgoff_t end)
{
	struct kvm *kvm = gmem->kvm;

	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

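/*
 * Handle FALLOC_FL_PUNCH_HOLE: zap any SPTEs covering the range from all
 * bound memslots, then truncate the backing pages out of the page cache.
 * The exclusive filemap invalidation lock keeps the bindings stable so the
 * invalidate begin/end calls remain balanced.
 */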
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;
	struct kvm_gmem *gmem;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

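/*
 * Preallocate (and zero) pages for the range via the page cache.  Allocation
 * beyond i_size is rejected, as the file size is fixed at creation time.
 * The invalidation lock is taken in shared mode since this path only adds
 * pages, it never removes them.
 */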
static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* Dedicated guest is immutable by default. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (!folio) {
			r = -ENOMEM;
			break;
		}

		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

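/*
 * The fallocate() handler: FALLOC_FL_KEEP_SIZE is mandatory since the file
 * size never changes, a plain allocation request populates and zeroes pages
 * up front, and FALLOC_FL_PUNCH_HOLE discards backing pages (zapping any
 * guest mappings of them).  Offset and length must be page aligned.
 */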
static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}

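/*
 * ->release() runs when the last reference to the file is put.  Sever all
 * memslot bindings, wait for in-flight RCU readers of slot->gmem.file, zap
 * any remaining SPTEs, and drop the reference this file holds on its VM.
 * The backing memory itself is freed with the inode, not here.
 */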
static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct kvm_gmem *gmem = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot. This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&gmem->bindings, index, slot)
		rcu_assign_pointer(slot->gmem.file, NULL);

	synchronize_rcu();

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file. Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
	kvm_gmem_invalidate_end(gmem, 0, -1ul);

	list_del(&gmem->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&gmem->bindings);
	kfree(gmem);

	kvm_put_kvm(kvm);

	return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file, and you do not
	 * want to spin in the meanwhile.
	 */
	return get_file_active(&slot->gmem.file);
}

static struct file_operations kvm_gmem_fops = {
	.open = generic_file_open,
	.release = kvm_gmem_release,
	.fallocate = kvm_gmem_fallocate,
};

void kvm_gmem_init(struct module *module)
{
	kvm_gmem_fops.owner = module;
}

static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

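/*
 * Called when the kernel detects a memory error (hwpoison) on a folio backing
 * a guest_memfd.  Unmap the affected range from any guests using it, but keep
 * the poisoned page in the page cache so that a later kvm_gmem_get_pfn() can
 * report -EHWPOISON.
 */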
static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	struct list_head *gmem_list = &mapping->i_private_list;
	struct kvm_gmem *gmem;
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors). If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio = kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
};

static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
			    struct kstat *stat, u32 request_mask,
			    unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;

	generic_fillattr(idmap, request_mask, inode, stat);
	return 0;
}

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
	.getattr = kvm_gmem_getattr,
	.setattr = kvm_gmem_setattr,
};

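/*
 * Allocate the anonymous inode/file pair that backs a guest_memfd, stash the
 * creation flags in i_private, mark the mapping unmovable (and thus
 * unevictable), set the file size, and return a new file descriptor.  The
 * file takes a reference on the VM so the VM cannot go away while the fd is
 * open.
 */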
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	const char *anon_name = "[kvm-gmem]";
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
	if (!gmem) {
		err = -ENOMEM;
		goto err_fd;
	}

	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
					 O_RDWR, NULL);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_gmem;
	}

	file->f_flags |= O_LARGEFILE;

	inode = file->f_inode;
	WARN_ON(file->f_mapping != inode->i_mapping);

	inode->i_private = (void *)(unsigned long)flags;
	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_unmovable(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	kvm_get_kvm(kvm);
	gmem->kvm = kvm;
	xa_init(&gmem->bindings);
	list_add(&gmem->entry, &inode->i_mapping->i_private_list);

	fd_install(fd, file);
	return fd;

err_gmem:
	kfree(gmem);
err_fd:
	put_unused_fd(fd);
	return err;
}

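/*
 * Entry point for guest_memfd creation (the KVM_CREATE_GUEST_MEMFD vm ioctl):
 * validate the requested size (non-zero and page aligned) and flags (none are
 * currently supported), then create the fd.
 */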
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;
	u64 valid_flags = 0;

	if (flags & ~valid_flags)
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}

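/*
 * Bind a memslot to a range of this guest_memfd.  The range must lie within
 * the file, must not overlap any existing binding, and the file must belong
 * to the same VM as the memslot.  On success the binding is recorded in the
 * gmem xarray and slot->gmem.file/pgoff are set; no long-term reference to
 * the file is kept, as the file pins the VM rather than the other way around.
 */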
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	gmem = file->private_data;
	if (gmem->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&gmem->bindings) &&
	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
	 * see either a NULL file or this new file; there is no need for them
	 * to go away.
	 */
	rcu_assign_pointer(slot->gmem.file, file);
	slot->gmem.pgoff = start;

	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success. The file pins KVM,
	 * not the other way 'round. Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}

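/*
 * Undo kvm_gmem_bind() when a memslot is deleted: clear the binding range in
 * the xarray, NULL out slot->gmem.file, and wait an RCU grace period before
 * dropping the temporary file reference.
 */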
void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;
	struct kvm_gmem *gmem;
	struct file *file;

	/*
	 * Nothing to do if the underlying file was already closed (or is being
	 * closed right now), kvm_gmem_release() invalidates all bindings.
	 */
	file = kvm_gmem_get_file(slot);
	if (!file)
		return;

	gmem = file->private_data;

	filemap_invalidate_lock(file->f_mapping);
	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
	rcu_assign_pointer(slot->gmem.file, NULL);
	synchronize_rcu();
	filemap_invalidate_unlock(file->f_mapping);

	fput(file);
}

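/*
 * Resolve a gfn in a guest_memfd-backed memslot to a pfn, allocating and
 * zeroing the backing page on first use.  Returns -EFAULT if the slot is no
 * longer bound to a live file, -EHWPOISON if the page has been poisoned, and
 * 0 on success with *pfn set (and *max_order forced to 0 while only order-0
 * pages are supported).
 */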
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
{
	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
	struct kvm_gmem *gmem;
	struct folio *folio;
	struct page *page;
	struct file *file;
	int r;

	file = kvm_gmem_get_file(slot);
	if (!file)
		return -EFAULT;

	gmem = file->private_data;

	if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
		r = -EIO;
		goto out_fput;
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (!folio) {
		r = -ENOMEM;
		goto out_fput;
	}

	if (folio_test_hwpoison(folio)) {
		r = -EHWPOISON;
		goto out_unlock;
	}

	page = folio_file_page(folio, index);

	*pfn = page_to_pfn(page);
	if (max_order)
		*max_order = 0;

	r = 0;

out_unlock:
	folio_unlock(folio);
out_fput:
	fput(file);

	return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);


Source code of linux/virt/kvm/guest_memfd.c.
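
For orientation, below is a minimal userspace sketch of how the fallocate paths above are exercised. It assumes the KVM_CREATE_GUEST_MEMFD vm ioctl and struct kvm_create_guest_memfd from a matching <linux/kvm.h> UAPI header, a 4 KiB page size, and omits error handling; it illustrates the interface this file implements and is not part of the kernel source.

/*
 * Userspace sketch (illustrative only): create a guest_memfd on an existing
 * VM fd, preallocate the first 2 MiB, then punch out the first page.
 * Assumes <linux/kvm.h> provides KVM_CREATE_GUEST_MEMFD and
 * struct kvm_create_guest_memfd; error handling omitted for brevity.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int create_and_populate_gmem(int vm_fd)
{
	struct kvm_create_guest_memfd args = {
		.size  = 0x400000,	/* must be non-zero and page aligned */
		.flags = 0,		/* no flags are accepted yet */
	};
	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);

	if (gmem_fd < 0)
		return gmem_fd;

	/* Populate (and zero) the first 2 MiB; KEEP_SIZE is mandatory. */
	fallocate(gmem_fd, FALLOC_FL_KEEP_SIZE, 0, 0x200000);

	/* Discard the first page and zap any guest mappings of it. */
	fallocate(gmem_fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
		  0, 0x1000);

	return gmem_fd;
}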