// SPDX-License-Identifier: GPL-2.0
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>

#include "kvm_mm.h"

struct kvm_gmem {
	struct kvm *kvm;
	struct xarray bindings;
	struct list_head entry;
};

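/*
 * Return a locked folio for @index in @inode's page cache, allocating and
 * zeroing it on first touch.  The caller is responsible for folio_unlock()
 * and folio_put().  Returns NULL on allocation failure.
 */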
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	struct folio *folio;

	/* TODO: Support huge pages. */
	folio = filemap_grab_folio(inode->i_mapping, index);
	if (IS_ERR_OR_NULL(folio))
		return NULL;

	/*
	 * Use the up-to-date flag to track whether or not the memory has been
	 * zeroed before being handed off to the guest.  There is no backing
	 * storage for the memory, so the folio will remain up-to-date until
	 * it's removed.
	 *
	 * TODO: Skip clearing pages when trusted firmware will do it when
	 * assigning memory to the guest.
	 */
	if (!folio_test_uptodate(folio)) {
		unsigned long nr_pages = folio_nr_pages(folio);
		unsigned long i;

		for (i = 0; i < nr_pages; i++)
			clear_highpage(folio_page(folio, i));

		folio_mark_uptodate(folio);
	}

	/*
	 * Ignore accessed, referenced, and dirty flags.  The memory is
	 * unevictable and there is no storage to write back to.
	 */
	return folio;
}

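/*
 * Zap the SPTEs of every memslot bound to a page in [start, end) and open an
 * MMU invalidation window.  Must be paired with kvm_gmem_invalidate_end(),
 * and callers hold the filemap invalidation lock so bindings stay stable.
 */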
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
				      pgoff_t end)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

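/*
 * Close the MMU invalidation window opened by kvm_gmem_invalidate_begin(),
 * if any memslot is still bound in [start, end).
 */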
static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
				    pgoff_t end)
{
	struct kvm *kvm = gmem->kvm;

	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

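/*
 * Handle FALLOC_FL_PUNCH_HOLE: unmap the range from all bound memslots and
 * free the backing pages.  A subsequent guest access repopulates the range
 * with freshly zeroed memory via kvm_gmem_get_folio().
 */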
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;
	struct kvm_gmem *gmem;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

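/*
 * Handle the default fallocate() mode: preallocate and zero the pages for
 * [offset, offset + len).  The file size is fixed, so allocating beyond EOF
 * fails with -EINVAL rather than extending the file.
 */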
static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* The file size is fixed at creation; allocation cannot extend it. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (!folio) {
			r = -ENOMEM;
			break;
		}

		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

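/*
 * guest_memfd only supports fallocate() with FALLOC_FL_KEEP_SIZE set, with
 * or without FALLOC_FL_PUNCH_HOLE, as the file size is fixed at creation.
 * Offset and length must be PAGE_SIZE-aligned.
 */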
static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}

static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct kvm_gmem *gmem = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&gmem->bindings, index, slot)
		rcu_assign_pointer(slot->gmem.file, NULL);

	synchronize_rcu();

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file.  Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
	kvm_gmem_invalidate_end(gmem, 0, -1ul);

	list_del(&gmem->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&gmem->bindings);
	kfree(gmem);

	kvm_put_kvm(kvm);

	return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file, and you do not
	 * want to spin in the meanwhile.
	 */
	return get_file_active(&slot->gmem.file);
}

static struct file_operations kvm_gmem_fops = {
	.open = generic_file_open,
	.release = kvm_gmem_release,
	.fallocate = kvm_gmem_fallocate,
};

void kvm_gmem_init(struct module *module)
{
	kvm_gmem_fops.owner = module;
}

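/*
 * guest_memfd mappings are marked unmovable, so folio migration should never
 * be attempted.  WARN and reject if it is.
 */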
static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	struct list_head *gmem_list = &mapping->i_private_list;
	struct kvm_gmem *gmem;
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors).  If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}


static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio = kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
};

static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
			    struct kstat *stat, u32 request_mask,
			    unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;

	generic_fillattr(idmap, request_mask, inode, stat);
	return 0;
}

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
	.getattr = kvm_gmem_getattr,
	.setattr = kvm_gmem_setattr,
};

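/*
 * Allocate the kvm_gmem instance and its anonymous inode, wire up the file,
 * inode, and address_space operations, and return a new file descriptor.
 * The file holds a reference on the VM: the file pins KVM, not vice versa.
 */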
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	const char *anon_name = "[kvm-gmem]";
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
	if (!gmem) {
		err = -ENOMEM;
		goto err_fd;
	}

	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
					 O_RDWR, NULL);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_gmem;
	}

	file->f_flags |= O_LARGEFILE;

	inode = file->f_inode;
	WARN_ON(file->f_mapping != inode->i_mapping);

	inode->i_private = (void *)(unsigned long)flags;
	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_unmovable(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	kvm_get_kvm(kvm);
	gmem->kvm = kvm;
	xa_init(&gmem->bindings);
	list_add(&gmem->entry, &inode->i_mapping->i_private_list);

	fd_install(fd, file);
	return fd;

err_gmem:
	kfree(gmem);
err_fd:
	put_unused_fd(fd);
	return err;
}

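/*
 * Validate the KVM_CREATE_GUEST_MEMFD arguments and create the file.  No
 * flags are currently defined, and the size must be a positive multiple of
 * PAGE_SIZE.  A minimal userspace sketch (vm_fd and vm_mem_size are assumed,
 * not defined here):
 *
 *	struct kvm_create_guest_memfd args = {
 *		.size = vm_mem_size,	// page-aligned, non-zero
 *		.flags = 0,		// no flags supported yet
 *	};
 *	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
 */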
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;
	u64 valid_flags = 0;

	if (flags & ~valid_flags)
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}

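/*
 * Bind a memslot to the page range [offset, offset + size) of a guest_memfd
 * file.  The range must lie entirely within the file and must not overlap
 * any existing binding.  Called under kvm->slots_lock.
 */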
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	gmem = file->private_data;
	if (gmem->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&gmem->bindings) &&
	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * No synchronize_rcu() needed, any in-flight readers are guaranteed
	 * to see either a NULL file or this new file; no need for them to go
	 * away.
	 */
	rcu_assign_pointer(slot->gmem.file, file);
	slot->gmem.pgoff = start;

	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success.  The file pins KVM,
	 * not the other way 'round.  Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}

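/*
 * Tear down a memslot's binding, if any: the reverse of kvm_gmem_bind().
 */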
void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;
	struct kvm_gmem *gmem;
	struct file *file;

	/*
	 * Nothing to do if the underlying file was already closed (or is being
	 * closed right now), kvm_gmem_release() invalidates all bindings.
	 */
	file = kvm_gmem_get_file(slot);
	if (!file)
		return;

	gmem = file->private_data;

	filemap_invalidate_lock(file->f_mapping);
	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
	rcu_assign_pointer(slot->gmem.file, NULL);
	synchronize_rcu();
	filemap_invalidate_unlock(file->f_mapping);

	fput(file);
}

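/*
 * Resolve @gfn in @slot to the pfn of its guest_memfd backing page,
 * allocating and zeroing the page on first use.  On success the pfn is
 * returned in @pfn with the folio unlocked but its refcount elevated; the
 * caller is responsible for the eventual put.  *max_order is currently
 * always 0, i.e. only order-0 (base page size) mappings are supported.
 */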
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
{
	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
	struct kvm_gmem *gmem;
	struct folio *folio;
	struct page *page;
	struct file *file;
	int r;

	file = kvm_gmem_get_file(slot);
	if (!file)
		return -EFAULT;

	gmem = file->private_data;

	if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
		r = -EIO;
		goto out_fput;
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (!folio) {
		r = -ENOMEM;
		goto out_fput;
	}

	if (folio_test_hwpoison(folio)) {
		/*
		 * Drop both the lock and the reference; the poisoned page is
		 * not handed to the caller.
		 */
		folio_unlock(folio);
		folio_put(folio);
		r = -EHWPOISON;
		goto out_fput;
	}

	page = folio_file_page(folio, index);

	*pfn = page_to_pfn(page);
	if (max_order)
		*max_order = 0;

	r = 0;

	folio_unlock(folio);
out_fput:
	fput(file);

	return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);