// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-2018 Intel Corporation. All rights reserved. */
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/pfn_t.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include "dax-private.h"
#include "bus.h"

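/*
 * Reference counting for the device-dax instance: dev_dax embeds a
 * percpu_ref that is handed to devm_memremap_pages() in dev_dax_probe().
 * ref_to_dev_dax() recovers the containing dev_dax, and the
 * release/exit/kill helpers implement the shutdown sequence: kill the
 * ref, wait for the final put to signal the completion, then tear the
 * ref down.
 */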
static struct dev_dax *ref_to_dev_dax(struct percpu_ref *ref)
{
	return container_of(ref, struct dev_dax, ref);
}

static void dev_dax_percpu_release(struct percpu_ref *ref)
{
	struct dev_dax *dev_dax = ref_to_dev_dax(ref);

	dev_dbg(&dev_dax->dev, "%s\n", __func__);
	complete(&dev_dax->cmp);
}

static void dev_dax_percpu_exit(void *data)
{
	struct percpu_ref *ref = data;
	struct dev_dax *dev_dax = ref_to_dev_dax(ref);

	dev_dbg(&dev_dax->dev, "%s\n", __func__);
	wait_for_completion(&dev_dax->cmp);
	percpu_ref_exit(ref);
}

static void dev_dax_percpu_kill(struct percpu_ref *data)
{
	struct percpu_ref *ref = data;
	struct dev_dax *dev_dax = ref_to_dev_dax(ref);

	dev_dbg(&dev_dax->dev, "%s\n", __func__);
	percpu_ref_kill(ref);
}

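/*
 * Validate that a VMA is suitable for device-dax: the backing dax_device
 * must still be alive, the mapping must be shared (not MAP_PRIVATE),
 * start and end must be aligned to the region alignment, regions without
 * struct pages (PFN_DEV without PFN_MAP) require VM_DONTCOPY
 * (MADV_DONTFORK), and the VMA must be DAX capable.
 */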
static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
		const char *func)
{
	struct dax_region *dax_region = dev_dax->region;
	struct device *dev = &dev_dax->dev;
	unsigned long mask;

	if (!dax_alive(dev_dax->dax_dev))
		return -ENXIO;

	/* prevent private mappings from being established */
	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
		dev_info_ratelimited(dev,
				"%s: %s: fail, attempted private mapping\n",
				current->comm, func);
		return -EINVAL;
	}

	mask = dax_region->align - 1;
	if (vma->vm_start & mask || vma->vm_end & mask) {
		dev_info_ratelimited(dev,
				"%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
				current->comm, func, vma->vm_start, vma->vm_end,
				mask);
		return -EINVAL;
	}

	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
		dev_info_ratelimited(dev,
				"%s: %s: fail, dax range requires MADV_DONTFORK\n",
				current->comm, func);
		return -EINVAL;
	}

	if (!vma_is_dax(vma)) {
		dev_info_ratelimited(dev,
				"%s: %s: fail, vma is not DAX capable\n",
				current->comm, func);
		return -EINVAL;
	}

	return 0;
}

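/*
 * Translate a file page offset into a physical address inside the dax
 * region's resource range, or return -1 if pgoff + size does not fit
 * entirely within the region.
 */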
/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */
__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
		unsigned long size)
{
	struct resource *res = &dev_dax->region->res;
	phys_addr_t phys;

	phys = pgoff * PAGE_SIZE + res->start;
	if (phys >= res->start && phys <= res->end) {
		if (phys + size - 1 <= res->end)
			return phys;
	}

	return -1;
}

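/*
 * Handle a PTE-sized (PAGE_SIZE) fault: reject it if the region alignment
 * is larger than a page, translate the faulting offset to a physical
 * address, and install a single-page mapping.
 */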
static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
				struct vm_fault *vmf, pfn_t *pfn)
{
	struct device *dev = &dev_dax->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	unsigned int fault_size = PAGE_SIZE;

	if (check_vma(dev_dax, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dev_dax->region;
	if (dax_region->align > PAGE_SIZE) {
		dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
				dax_region->align, fault_size);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size != dax_region->align)
		return VM_FAULT_SIGBUS;

	phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", vmf->pgoff);
		return VM_FAULT_SIGBUS;
	}

	*pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
}

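/*
 * Handle a PMD-sized fault: require a devmap-capable region (PFN_DEV and
 * PFN_MAP), fall back to PTEs when the region alignment is smaller than
 * PMD_SIZE, verify that the PMD-aligned range lies inside the VMA, and
 * then install a PMD-sized mapping.
 */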
static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
				struct vm_fault *vmf, pfn_t *pfn)
{
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct device *dev = &dev_dax->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	pgoff_t pgoff;
	unsigned int fault_size = PMD_SIZE;

	if (check_vma(dev_dax, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dev_dax->region;
	if (dax_region->align > PMD_SIZE) {
		dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
				dax_region->align, fault_size);
		return VM_FAULT_SIGBUS;
	}

	/* dax pmd mappings require pfn_t_devmap() */
	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
		dev_dbg(dev, "region lacks devmap flags\n");
		return VM_FAULT_SIGBUS;
	}

	if (fault_size < dax_region->align)
		return VM_FAULT_SIGBUS;
	else if (fault_size > dax_region->align)
		return VM_FAULT_FALLBACK;

	/* if we are outside of the VMA */
	if (pmd_addr < vmf->vma->vm_start ||
			(pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
		return VM_FAULT_SIGBUS;

	pgoff = linear_page_index(vmf->vma, pmd_addr);
	phys = dax_pgoff_to_phys(dev_dax, pgoff, PMD_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", pgoff);
		return VM_FAULT_SIGBUS;
	}

	*pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, *pfn,
			vmf->flags & FAULT_FLAG_WRITE);
}

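/*
 * PUD-sized faults mirror the PMD path, but only when the architecture
 * supports transparent huge PUD pages; otherwise the stub below asks the
 * core fault handler to fall back to a smaller mapping size.
 */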
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
				struct vm_fault *vmf, pfn_t *pfn)
{
	unsigned long pud_addr = vmf->address & PUD_MASK;
	struct device *dev = &dev_dax->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	pgoff_t pgoff;
	unsigned int fault_size = PUD_SIZE;

	if (check_vma(dev_dax, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dev_dax->region;
	if (dax_region->align > PUD_SIZE) {
		dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
				dax_region->align, fault_size);
		return VM_FAULT_SIGBUS;
	}

	/* dax pud mappings require pfn_t_devmap() */
	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
		dev_dbg(dev, "region lacks devmap flags\n");
		return VM_FAULT_SIGBUS;
	}

	if (fault_size < dax_region->align)
		return VM_FAULT_SIGBUS;
	else if (fault_size > dax_region->align)
		return VM_FAULT_FALLBACK;

	/* if we are outside of the VMA */
	if (pud_addr < vmf->vma->vm_start ||
			(pud_addr + PUD_SIZE) > vmf->vma->vm_end)
		return VM_FAULT_SIGBUS;

	pgoff = linear_page_index(vmf->vma, pud_addr);
	phys = dax_pgoff_to_phys(dev_dax, pgoff, PUD_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", pgoff);
		return VM_FAULT_SIGBUS;
	}

	*pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, *pfn,
			vmf->flags & FAULT_FLAG_WRITE);
}
#else
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
				struct vm_fault *vmf, pfn_t *pfn)
{
	return VM_FAULT_FALLBACK;
}
#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

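/*
 * Top-level fault handler: dispatch to the PTE/PMD/PUD helper for the
 * requested entry size under dax_read_lock(), and on success associate
 * each newly mapped page with the file's address_space and offset.
 */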
static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	struct file *filp = vmf->vma->vm_file;
	unsigned long fault_size;
	vm_fault_t rc = VM_FAULT_SIGBUS;
	int id;
	pfn_t pfn;
	struct dev_dax *dev_dax = filp->private_data;

	dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,
			(vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read",
			vmf->vma->vm_start, vmf->vma->vm_end, pe_size);

	id = dax_read_lock();
	switch (pe_size) {
	case PE_SIZE_PTE:
		fault_size = PAGE_SIZE;
		rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
		break;
	case PE_SIZE_PMD:
		fault_size = PMD_SIZE;
		rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
		break;
	case PE_SIZE_PUD:
		fault_size = PUD_SIZE;
		rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
		break;
	default:
		rc = VM_FAULT_SIGBUS;
	}

	if (rc == VM_FAULT_NOPAGE) {
		unsigned long i;
		pgoff_t pgoff;

		/*
		 * In the device-dax case the only possibility for a
		 * VM_FAULT_NOPAGE result is when device-dax capacity is
		 * mapped. No need to consider the zero page, or racing
		 * conflicting mappings.
		 */
		pgoff = linear_page_index(vmf->vma, vmf->address
				& ~(fault_size - 1));
		for (i = 0; i < fault_size / PAGE_SIZE; i++) {
			struct page *page;

			page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
			if (page->mapping)
				continue;
			page->mapping = filp->f_mapping;
			page->index = pgoff + i;
		}
	}
	dax_read_unlock(id);

	return rc;
}

static vm_fault_t dev_dax_fault(struct vm_fault *vmf)
{
	return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
}

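/*
 * Refuse to split a VMA at an address that is not aligned to the region
 * alignment, and report that alignment as the effective page size of a
 * device-dax mapping.
 */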
static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr)
{
	struct file *filp = vma->vm_file;
	struct dev_dax *dev_dax = filp->private_data;
	struct dax_region *dax_region = dev_dax->region;

	if (!IS_ALIGNED(addr, dax_region->align))
		return -EINVAL;
	return 0;
}

static unsigned long dev_dax_pagesize(struct vm_area_struct *vma)
{
	struct file *filp = vma->vm_file;
	struct dev_dax *dev_dax = filp->private_data;
	struct dax_region *dax_region = dev_dax->region;

	return dax_region->align;
}

static const struct vm_operations_struct dax_vm_ops = {
	.fault = dev_dax_fault,
	.huge_fault = dev_dax_huge_fault,
	.split = dev_dax_split,
	.pagesize = dev_dax_pagesize,
};

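/*
 * mmap() entry point: validate the VMA once up front (it is re-checked at
 * fault time), install dax_vm_ops, and mark the VMA as preferring huge
 * mappings.
 */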
static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct dev_dax *dev_dax = filp->private_data;
	int rc, id;

	dev_dbg(&dev_dax->dev, "trace\n");

	/*
	 * We lock to check dax_dev liveness and will re-check at
	 * fault time.
	 */
	id = dax_read_lock();
	rc = check_vma(dev_dax, vma, __func__);
	dax_read_unlock(id);
	if (rc)
		return rc;

	vma->vm_ops = &dax_vm_ops;
	vma->vm_flags |= VM_HUGEPAGE;
	return 0;
}

/* return an unmapped area aligned to the dax region specified alignment */
static unsigned long dax_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len, unsigned long pgoff,
		unsigned long flags)
{
	unsigned long off, off_end, off_align, len_align, addr_align, align;
	struct dev_dax *dev_dax = filp ? filp->private_data : NULL;
	struct dax_region *dax_region;

	if (!dev_dax || addr)
		goto out;

	dax_region = dev_dax->region;
	align = dax_region->align;
	off = pgoff << PAGE_SHIFT;
	off_end = off + len;
	off_align = round_up(off, align);

	if ((off_end <= off_align) || ((off_end - off_align) < align))
		goto out;

	len_align = len + align;
	if ((off + len_align) < off)
		goto out;

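	/*
	 * Ask for an area padded by one extra alignment unit, then slide
	 * the returned address forward so that it is congruent with the
	 * file offset modulo the region alignment. For example, with a
	 * 2MiB-aligned region, requesting len + 2MiB guarantees that a
	 * suitably aligned start address exists inside the returned range.
	 */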
	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
			pgoff, flags);
	if (!IS_ERR_VALUE(addr_align)) {
		addr_align += (off - addr_align) & (align - 1);
		return addr_align;
	}
out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}

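/*
 * Device-dax does not use the page cache, so dirty tracking and page
 * invalidation are no-ops.
 */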
static const struct address_space_operations dev_dax_aops = {
	.set_page_dirty = noop_set_page_dirty,
	.invalidatepage = noop_invalidatepage,
};

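/*
 * open() redirects the character device inode's mapping to the shared
 * dax_device inode's address_space and stashes the dev_dax instance in
 * the file for the mmap and fault paths.
 */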
static int dax_open(struct inode *inode, struct file *filp)
{
	struct dax_device *dax_dev = inode_dax(inode);
	struct inode *__dax_inode = dax_inode(dax_dev);
	struct dev_dax *dev_dax = dax_get_private(dax_dev);

	dev_dbg(&dev_dax->dev, "trace\n");
	inode->i_mapping = __dax_inode->i_mapping;
	inode->i_mapping->host = __dax_inode;
	inode->i_mapping->a_ops = &dev_dax_aops;
	filp->f_mapping = inode->i_mapping;
	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
	filp->private_data = dev_dax;
	inode->i_flags = S_DAX;

	return 0;
}

static int dax_release(struct inode *inode, struct file *filp)
{
	struct dev_dax *dev_dax = filp->private_data;

	dev_dbg(&dev_dax->dev, "trace\n");
	return 0;
}

static const struct file_operations dax_fops = {
	.llseek = noop_llseek,
	.owner = THIS_MODULE,
	.open = dax_open,
	.release = dax_release,
	.get_unmapped_area = dax_get_unmapped_area,
	.mmap = dax_mmap,
	.mmap_supported_flags = MAP_SYNC,
};

static void dev_dax_cdev_del(void *cdev)
{
	cdev_del(cdev);
}

static void dev_dax_kill(void *dev_dax)
{
	kill_dev_dax(dev_dax);
}

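/*
 * Bind a dev_dax instance to its region: reserve the physical range,
 * initialize the percpu_ref and hand it to devm_memremap_pages() to
 * create struct pages for the range, then register the character device
 * node. All teardown is driven by the devm actions registered here.
 */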
int dev_dax_probe(struct device *dev)
{
	struct dev_dax *dev_dax = to_dev_dax(dev);
	struct dax_device *dax_dev = dev_dax->dax_dev;
	struct resource *res = &dev_dax->region->res;
	struct inode *inode;
	struct cdev *cdev;
	void *addr;
	int rc;

	/* 1:1 map region resource range to device-dax instance range */
	if (!devm_request_mem_region(dev, res->start, resource_size(res),
				dev_name(dev))) {
		dev_warn(dev, "could not reserve region %pR\n", res);
		return -EBUSY;
	}

	init_completion(&dev_dax->cmp);
	rc = percpu_ref_init(&dev_dax->ref, dev_dax_percpu_release, 0,
			GFP_KERNEL);
	if (rc)
		return rc;

	rc = devm_add_action_or_reset(dev, dev_dax_percpu_exit, &dev_dax->ref);
	if (rc)
		return rc;

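	/*
	 * Hand the percpu_ref and kill callback to the pgmap so that
	 * devm_memremap_pages() can shut down page references when the
	 * device goes away. On failure, drop the devm exit action and tear
	 * the ref down directly.
	 */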
	dev_dax->pgmap.ref = &dev_dax->ref;
	dev_dax->pgmap.kill = dev_dax_percpu_kill;
	addr = devm_memremap_pages(dev, &dev_dax->pgmap);
	if (IS_ERR(addr)) {
		devm_remove_action(dev, dev_dax_percpu_exit, &dev_dax->ref);
		percpu_ref_exit(&dev_dax->ref);
		return PTR_ERR(addr);
	}

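	/*
	 * The cdev is embedded in the dax_device's inode; pick the owning
	 * module from the parent driver in the CONFIG_DEV_DAX_PMEM_COMPAT
	 * (class device) case, otherwise from this device's driver.
	 */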
	inode = dax_inode(dax_dev);
	cdev = inode->i_cdev;
	cdev_init(cdev, &dax_fops);
	if (dev->class) {
		/* for the CONFIG_DEV_DAX_PMEM_COMPAT case */
		cdev->owner = dev->parent->driver->owner;
	} else
		cdev->owner = dev->driver->owner;
	cdev_set_parent(cdev, &dev->kobj);
	rc = cdev_add(cdev, dev->devt, 1);
	if (rc)
		return rc;

	rc = devm_add_action_or_reset(dev, dev_dax_cdev_del, cdev);
	if (rc)
		return rc;

	run_dax(dax_dev);
	return devm_add_action_or_reset(dev, dev_dax_kill, dev_dax);
}
EXPORT_SYMBOL_GPL(dev_dax_probe);

static int dev_dax_remove(struct device *dev)
{
	/* all probe actions are unwound by devm */
	return 0;
}

static struct dax_device_driver device_dax_driver = {
	.drv = {
		.probe = dev_dax_probe,
		.remove = dev_dax_remove,
	},
	.match_always = 1,
};

static int __init dax_init(void)
{
	return dax_driver_register(&device_dax_driver);
}

static void __exit dax_exit(void)
{
	dax_driver_unregister(&device_dax_driver);
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
module_init(dax_init);
module_exit(dax_exit);
MODULE_ALIAS_DAX_DEVICE(0);