1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* Copyright(c) 2016-2018 Intel Corporation. All rights reserved. */ |
3 | #include <linux/memremap.h> |
4 | #include <linux/pagemap.h> |
5 | #include <linux/module.h> |
6 | #include <linux/device.h> |
7 | #include <linux/pfn_t.h> |
8 | #include <linux/cdev.h> |
9 | #include <linux/slab.h> |
10 | #include <linux/dax.h> |
11 | #include <linux/fs.h> |
12 | #include <linux/mm.h> |
13 | #include <linux/mman.h> |
14 | #include "dax-private.h" |
15 | #include "bus.h" |
16 | |
17 | static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, |
18 | const char *func) |
19 | { |
20 | struct device *dev = &dev_dax->dev; |
21 | unsigned long mask; |
22 | |
23 | if (!dax_alive(dax_dev: dev_dax->dax_dev)) |
24 | return -ENXIO; |
25 | |
26 | /* prevent private mappings from being established */ |
27 | if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { |
28 | dev_info_ratelimited(dev, |
29 | "%s: %s: fail, attempted private mapping\n" , |
30 | current->comm, func); |
31 | return -EINVAL; |
32 | } |
33 | |
34 | mask = dev_dax->align - 1; |
35 | if (vma->vm_start & mask || vma->vm_end & mask) { |
36 | dev_info_ratelimited(dev, |
37 | "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n" , |
38 | current->comm, func, vma->vm_start, vma->vm_end, |
39 | mask); |
40 | return -EINVAL; |
41 | } |
42 | |
43 | if (!vma_is_dax(vma)) { |
44 | dev_info_ratelimited(dev, |
45 | "%s: %s: fail, vma is not DAX capable\n" , |
46 | current->comm, func); |
47 | return -EINVAL; |
48 | } |
49 | |
50 | return 0; |
51 | } |
52 | |
53 | /* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ |
54 | __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, |
55 | unsigned long size) |
56 | { |
57 | int i; |
58 | |
59 | for (i = 0; i < dev_dax->nr_range; i++) { |
60 | struct dev_dax_range *dax_range = &dev_dax->ranges[i]; |
61 | struct range *range = &dax_range->range; |
62 | unsigned long long pgoff_end; |
63 | phys_addr_t phys; |
64 | |
65 | pgoff_end = dax_range->pgoff + PHYS_PFN(range_len(range)) - 1; |
66 | if (pgoff < dax_range->pgoff || pgoff > pgoff_end) |
67 | continue; |
68 | phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start; |
69 | if (phys + size - 1 <= range->end) |
70 | return phys; |
71 | break; |
72 | } |
73 | return -1; |
74 | } |
75 | |
76 | static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn, |
77 | unsigned long fault_size) |
78 | { |
79 | unsigned long i, nr_pages = fault_size / PAGE_SIZE; |
80 | struct file *filp = vmf->vma->vm_file; |
81 | struct dev_dax *dev_dax = filp->private_data; |
82 | pgoff_t pgoff; |
83 | |
84 | /* mapping is only set on the head */ |
85 | if (dev_dax->pgmap->vmemmap_shift) |
86 | nr_pages = 1; |
87 | |
88 | pgoff = linear_page_index(vma: vmf->vma, |
89 | ALIGN(vmf->address, fault_size)); |
90 | |
91 | for (i = 0; i < nr_pages; i++) { |
92 | struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i); |
93 | |
94 | page = compound_head(page); |
95 | if (page->mapping) |
96 | continue; |
97 | |
98 | page->mapping = filp->f_mapping; |
99 | page->index = pgoff + i; |
100 | } |
101 | } |
102 | |
103 | static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, |
104 | struct vm_fault *vmf) |
105 | { |
106 | struct device *dev = &dev_dax->dev; |
107 | phys_addr_t phys; |
108 | pfn_t pfn; |
109 | unsigned int fault_size = PAGE_SIZE; |
110 | |
111 | if (check_vma(dev_dax, vma: vmf->vma, func: __func__)) |
112 | return VM_FAULT_SIGBUS; |
113 | |
114 | if (dev_dax->align > PAGE_SIZE) { |
115 | dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n" , |
116 | dev_dax->align, fault_size); |
117 | return VM_FAULT_SIGBUS; |
118 | } |
119 | |
120 | if (fault_size != dev_dax->align) |
121 | return VM_FAULT_SIGBUS; |
122 | |
123 | phys = dax_pgoff_to_phys(dev_dax, pgoff: vmf->pgoff, PAGE_SIZE); |
124 | if (phys == -1) { |
125 | dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n" , vmf->pgoff); |
126 | return VM_FAULT_SIGBUS; |
127 | } |
128 | |
129 | pfn = phys_to_pfn_t(addr: phys, PFN_DEV|PFN_MAP); |
130 | |
131 | dax_set_mapping(vmf, pfn, fault_size); |
132 | |
133 | return vmf_insert_mixed(vma: vmf->vma, addr: vmf->address, pfn); |
134 | } |
135 | |
136 | static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, |
137 | struct vm_fault *vmf) |
138 | { |
139 | unsigned long pmd_addr = vmf->address & PMD_MASK; |
140 | struct device *dev = &dev_dax->dev; |
141 | phys_addr_t phys; |
142 | pgoff_t pgoff; |
143 | pfn_t pfn; |
144 | unsigned int fault_size = PMD_SIZE; |
145 | |
146 | if (check_vma(dev_dax, vma: vmf->vma, func: __func__)) |
147 | return VM_FAULT_SIGBUS; |
148 | |
149 | if (dev_dax->align > PMD_SIZE) { |
150 | dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n" , |
151 | dev_dax->align, fault_size); |
152 | return VM_FAULT_SIGBUS; |
153 | } |
154 | |
155 | if (fault_size < dev_dax->align) |
156 | return VM_FAULT_SIGBUS; |
157 | else if (fault_size > dev_dax->align) |
158 | return VM_FAULT_FALLBACK; |
159 | |
160 | /* if we are outside of the VMA */ |
161 | if (pmd_addr < vmf->vma->vm_start || |
162 | (pmd_addr + PMD_SIZE) > vmf->vma->vm_end) |
163 | return VM_FAULT_SIGBUS; |
164 | |
165 | pgoff = linear_page_index(vma: vmf->vma, address: pmd_addr); |
166 | phys = dax_pgoff_to_phys(dev_dax, pgoff, PMD_SIZE); |
167 | if (phys == -1) { |
168 | dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n" , pgoff); |
169 | return VM_FAULT_SIGBUS; |
170 | } |
171 | |
172 | pfn = phys_to_pfn_t(addr: phys, PFN_DEV|PFN_MAP); |
173 | |
174 | dax_set_mapping(vmf, pfn, fault_size); |
175 | |
176 | return vmf_insert_pfn_pmd(vmf, pfn, write: vmf->flags & FAULT_FLAG_WRITE); |
177 | } |
178 | |
179 | #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD |
180 | static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, |
181 | struct vm_fault *vmf) |
182 | { |
183 | unsigned long pud_addr = vmf->address & PUD_MASK; |
184 | struct device *dev = &dev_dax->dev; |
185 | phys_addr_t phys; |
186 | pgoff_t pgoff; |
187 | pfn_t pfn; |
188 | unsigned int fault_size = PUD_SIZE; |
189 | |
190 | |
191 | if (check_vma(dev_dax, vma: vmf->vma, func: __func__)) |
192 | return VM_FAULT_SIGBUS; |
193 | |
194 | if (dev_dax->align > PUD_SIZE) { |
195 | dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n" , |
196 | dev_dax->align, fault_size); |
197 | return VM_FAULT_SIGBUS; |
198 | } |
199 | |
200 | if (fault_size < dev_dax->align) |
201 | return VM_FAULT_SIGBUS; |
202 | else if (fault_size > dev_dax->align) |
203 | return VM_FAULT_FALLBACK; |
204 | |
205 | /* if we are outside of the VMA */ |
206 | if (pud_addr < vmf->vma->vm_start || |
207 | (pud_addr + PUD_SIZE) > vmf->vma->vm_end) |
208 | return VM_FAULT_SIGBUS; |
209 | |
210 | pgoff = linear_page_index(vma: vmf->vma, address: pud_addr); |
211 | phys = dax_pgoff_to_phys(dev_dax, pgoff, PUD_SIZE); |
212 | if (phys == -1) { |
213 | dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n" , pgoff); |
214 | return VM_FAULT_SIGBUS; |
215 | } |
216 | |
217 | pfn = phys_to_pfn_t(addr: phys, PFN_DEV|PFN_MAP); |
218 | |
219 | dax_set_mapping(vmf, pfn, fault_size); |
220 | |
221 | return vmf_insert_pfn_pud(vmf, pfn, write: vmf->flags & FAULT_FLAG_WRITE); |
222 | } |
223 | #else |
224 | static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, |
225 | struct vm_fault *vmf) |
226 | { |
227 | return VM_FAULT_FALLBACK; |
228 | } |
229 | #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ |
230 | |
231 | static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, unsigned int order) |
232 | { |
233 | struct file *filp = vmf->vma->vm_file; |
234 | vm_fault_t rc = VM_FAULT_SIGBUS; |
235 | int id; |
236 | struct dev_dax *dev_dax = filp->private_data; |
237 | |
238 | dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) order:%d\n" , current->comm, |
239 | (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read" , |
240 | vmf->vma->vm_start, vmf->vma->vm_end, order); |
241 | |
242 | id = dax_read_lock(); |
243 | if (order == 0) |
244 | rc = __dev_dax_pte_fault(dev_dax, vmf); |
245 | else if (order == PMD_ORDER) |
246 | rc = __dev_dax_pmd_fault(dev_dax, vmf); |
247 | else if (order == PUD_ORDER) |
248 | rc = __dev_dax_pud_fault(dev_dax, vmf); |
249 | else |
250 | rc = VM_FAULT_SIGBUS; |
251 | |
252 | dax_read_unlock(id); |
253 | |
254 | return rc; |
255 | } |
256 | |
257 | static vm_fault_t dev_dax_fault(struct vm_fault *vmf) |
258 | { |
259 | return dev_dax_huge_fault(vmf, order: 0); |
260 | } |
261 | |
262 | static int dev_dax_may_split(struct vm_area_struct *vma, unsigned long addr) |
263 | { |
264 | struct file *filp = vma->vm_file; |
265 | struct dev_dax *dev_dax = filp->private_data; |
266 | |
267 | if (!IS_ALIGNED(addr, dev_dax->align)) |
268 | return -EINVAL; |
269 | return 0; |
270 | } |
271 | |
272 | static unsigned long dev_dax_pagesize(struct vm_area_struct *vma) |
273 | { |
274 | struct file *filp = vma->vm_file; |
275 | struct dev_dax *dev_dax = filp->private_data; |
276 | |
277 | return dev_dax->align; |
278 | } |
279 | |
280 | static const struct vm_operations_struct dax_vm_ops = { |
281 | .fault = dev_dax_fault, |
282 | .huge_fault = dev_dax_huge_fault, |
283 | .may_split = dev_dax_may_split, |
284 | .pagesize = dev_dax_pagesize, |
285 | }; |
286 | |
287 | static int dax_mmap(struct file *filp, struct vm_area_struct *vma) |
288 | { |
289 | struct dev_dax *dev_dax = filp->private_data; |
290 | int rc, id; |
291 | |
292 | dev_dbg(&dev_dax->dev, "trace\n" ); |
293 | |
294 | /* |
295 | * We lock to check dax_dev liveness and will re-check at |
296 | * fault time. |
297 | */ |
298 | id = dax_read_lock(); |
299 | rc = check_vma(dev_dax, vma, func: __func__); |
300 | dax_read_unlock(id); |
301 | if (rc) |
302 | return rc; |
303 | |
304 | vma->vm_ops = &dax_vm_ops; |
305 | vm_flags_set(vma, VM_HUGEPAGE); |
306 | return 0; |
307 | } |
308 | |
309 | /* return an unmapped area aligned to the dax region specified alignment */ |
310 | static unsigned long dax_get_unmapped_area(struct file *filp, |
311 | unsigned long addr, unsigned long len, unsigned long pgoff, |
312 | unsigned long flags) |
313 | { |
314 | unsigned long off, off_end, off_align, len_align, addr_align, align; |
315 | struct dev_dax *dev_dax = filp ? filp->private_data : NULL; |
316 | |
317 | if (!dev_dax || addr) |
318 | goto out; |
319 | |
320 | align = dev_dax->align; |
321 | off = pgoff << PAGE_SHIFT; |
322 | off_end = off + len; |
323 | off_align = round_up(off, align); |
324 | |
325 | if ((off_end <= off_align) || ((off_end - off_align) < align)) |
326 | goto out; |
327 | |
328 | len_align = len + align; |
329 | if ((off + len_align) < off) |
330 | goto out; |
331 | |
332 | addr_align = current->mm->get_unmapped_area(filp, addr, len_align, |
333 | pgoff, flags); |
334 | if (!IS_ERR_VALUE(addr_align)) { |
335 | addr_align += (off - addr_align) & (align - 1); |
336 | return addr_align; |
337 | } |
338 | out: |
339 | return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); |
340 | } |
341 | |
342 | static const struct address_space_operations dev_dax_aops = { |
343 | .dirty_folio = noop_dirty_folio, |
344 | }; |
345 | |
346 | static int dax_open(struct inode *inode, struct file *filp) |
347 | { |
348 | struct dax_device *dax_dev = inode_dax(inode); |
349 | struct inode *__dax_inode = dax_inode(dax_dev); |
350 | struct dev_dax *dev_dax = dax_get_private(dax_dev); |
351 | |
352 | dev_dbg(&dev_dax->dev, "trace\n" ); |
353 | inode->i_mapping = __dax_inode->i_mapping; |
354 | inode->i_mapping->host = __dax_inode; |
355 | inode->i_mapping->a_ops = &dev_dax_aops; |
356 | filp->f_mapping = inode->i_mapping; |
357 | filp->f_wb_err = filemap_sample_wb_err(mapping: filp->f_mapping); |
358 | filp->f_sb_err = file_sample_sb_err(file: filp); |
359 | filp->private_data = dev_dax; |
360 | inode->i_flags = S_DAX; |
361 | |
362 | return 0; |
363 | } |
364 | |
365 | static int dax_release(struct inode *inode, struct file *filp) |
366 | { |
367 | struct dev_dax *dev_dax = filp->private_data; |
368 | |
369 | dev_dbg(&dev_dax->dev, "trace\n" ); |
370 | return 0; |
371 | } |
372 | |
373 | static const struct file_operations dax_fops = { |
374 | .llseek = noop_llseek, |
375 | .owner = THIS_MODULE, |
376 | .open = dax_open, |
377 | .release = dax_release, |
378 | .get_unmapped_area = dax_get_unmapped_area, |
379 | .mmap = dax_mmap, |
380 | .mmap_supported_flags = MAP_SYNC, |
381 | }; |
382 | |
/* devm action registered by dev_dax_probe(): delete the cdev passed as @cdev. */
static void dev_dax_cdev_del(void *cdev)
{
	struct cdev *chardev = cdev;

	cdev_del(chardev);
}
387 | |
/* devm action registered by dev_dax_probe(): hand @dev_dax to kill_dev_dax(). */
static void dev_dax_kill(void *dev_dax)
{
	struct dev_dax *victim = dev_dax;

	kill_dev_dax(victim);
}
392 | |
393 | static int dev_dax_probe(struct dev_dax *dev_dax) |
394 | { |
395 | struct dax_device *dax_dev = dev_dax->dax_dev; |
396 | struct device *dev = &dev_dax->dev; |
397 | struct dev_pagemap *pgmap; |
398 | struct inode *inode; |
399 | struct cdev *cdev; |
400 | void *addr; |
401 | int rc, i; |
402 | |
403 | if (static_dev_dax(dev_dax)) { |
404 | if (dev_dax->nr_range > 1) { |
405 | dev_warn(dev, |
406 | "static pgmap / multi-range device conflict\n" ); |
407 | return -EINVAL; |
408 | } |
409 | |
410 | pgmap = dev_dax->pgmap; |
411 | } else { |
412 | if (dev_dax->pgmap) { |
413 | dev_warn(dev, |
414 | "dynamic-dax with pre-populated page map\n" ); |
415 | return -EINVAL; |
416 | } |
417 | |
418 | pgmap = devm_kzalloc(dev, |
419 | struct_size(pgmap, ranges, dev_dax->nr_range - 1), |
420 | GFP_KERNEL); |
421 | if (!pgmap) |
422 | return -ENOMEM; |
423 | |
424 | pgmap->nr_range = dev_dax->nr_range; |
425 | dev_dax->pgmap = pgmap; |
426 | |
427 | for (i = 0; i < dev_dax->nr_range; i++) { |
428 | struct range *range = &dev_dax->ranges[i].range; |
429 | pgmap->ranges[i] = *range; |
430 | } |
431 | } |
432 | |
433 | for (i = 0; i < dev_dax->nr_range; i++) { |
434 | struct range *range = &dev_dax->ranges[i].range; |
435 | |
436 | if (!devm_request_mem_region(dev, range->start, |
437 | range_len(range), dev_name(dev))) { |
438 | dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n" , |
439 | i, range->start, range->end); |
440 | return -EBUSY; |
441 | } |
442 | } |
443 | |
444 | pgmap->type = MEMORY_DEVICE_GENERIC; |
445 | if (dev_dax->align > PAGE_SIZE) |
446 | pgmap->vmemmap_shift = |
447 | order_base_2(dev_dax->align >> PAGE_SHIFT); |
448 | addr = devm_memremap_pages(dev, pgmap); |
449 | if (IS_ERR(ptr: addr)) |
450 | return PTR_ERR(ptr: addr); |
451 | |
452 | inode = dax_inode(dax_dev); |
453 | cdev = inode->i_cdev; |
454 | cdev_init(cdev, &dax_fops); |
455 | cdev->owner = dev->driver->owner; |
456 | cdev_set_parent(p: cdev, kobj: &dev->kobj); |
457 | rc = cdev_add(cdev, dev->devt, 1); |
458 | if (rc) |
459 | return rc; |
460 | |
461 | rc = devm_add_action_or_reset(dev, dev_dax_cdev_del, cdev); |
462 | if (rc) |
463 | return rc; |
464 | |
465 | run_dax(dax_dev); |
466 | return devm_add_action_or_reset(dev, dev_dax_kill, dev_dax); |
467 | } |
468 | |
469 | static struct dax_device_driver device_dax_driver = { |
470 | .probe = dev_dax_probe, |
471 | .type = DAXDRV_DEVICE_TYPE, |
472 | }; |
473 | |
474 | static int __init dax_init(void) |
475 | { |
476 | return dax_driver_register(&device_dax_driver); |
477 | } |
478 | |
479 | static void __exit dax_exit(void) |
480 | { |
481 | dax_driver_unregister(dax_drv: &device_dax_driver); |
482 | } |
483 | |
484 | MODULE_AUTHOR("Intel Corporation" ); |
485 | MODULE_LICENSE("GPL v2" ); |
486 | module_init(dax_init); |
487 | module_exit(dax_exit); |
488 | MODULE_ALIAS_DAX_DEVICE(0); |
489 | |