1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * PCI Peer 2 Peer DMA support. |
4 | * |
5 | * Copyright (c) 2016-2018, Logan Gunthorpe |
6 | * Copyright (c) 2016-2017, Microsemi Corporation |
7 | * Copyright (c) 2017, Christoph Hellwig |
8 | * Copyright (c) 2018, Eideticom Inc. |
9 | */ |
10 | |
11 | #define pr_fmt(fmt) "pci-p2pdma: " fmt |
12 | #include <linux/ctype.h> |
13 | #include <linux/dma-map-ops.h> |
14 | #include <linux/pci-p2pdma.h> |
15 | #include <linux/module.h> |
16 | #include <linux/slab.h> |
17 | #include <linux/genalloc.h> |
18 | #include <linux/memremap.h> |
19 | #include <linux/percpu-refcount.h> |
20 | #include <linux/random.h> |
21 | #include <linux/seq_buf.h> |
22 | #include <linux/xarray.h> |
23 | |
24 | struct pci_p2pdma { |
25 | struct gen_pool *pool; |
26 | bool p2pmem_published; |
27 | struct xarray map_types; |
28 | }; |
29 | |
30 | struct pci_p2pdma_pagemap { |
31 | struct pci_dev *provider; |
32 | u64 bus_offset; |
33 | struct dev_pagemap pgmap; |
34 | }; |
35 | |
36 | static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) |
37 | { |
38 | return container_of(pgmap, struct pci_p2pdma_pagemap, pgmap); |
39 | } |
40 | |
41 | static ssize_t size_show(struct device *dev, struct device_attribute *attr, |
42 | char *buf) |
43 | { |
44 | struct pci_dev *pdev = to_pci_dev(dev); |
45 | struct pci_p2pdma *p2pdma; |
46 | size_t size = 0; |
47 | |
48 | rcu_read_lock(); |
49 | p2pdma = rcu_dereference(pdev->p2pdma); |
50 | if (p2pdma && p2pdma->pool) |
51 | size = gen_pool_size(p2pdma->pool); |
52 | rcu_read_unlock(); |
53 | |
54 | return sysfs_emit(buf, fmt: "%zd\n" , size); |
55 | } |
56 | static DEVICE_ATTR_RO(size); |
57 | |
58 | static ssize_t available_show(struct device *dev, struct device_attribute *attr, |
59 | char *buf) |
60 | { |
61 | struct pci_dev *pdev = to_pci_dev(dev); |
62 | struct pci_p2pdma *p2pdma; |
63 | size_t avail = 0; |
64 | |
65 | rcu_read_lock(); |
66 | p2pdma = rcu_dereference(pdev->p2pdma); |
67 | if (p2pdma && p2pdma->pool) |
68 | avail = gen_pool_avail(p2pdma->pool); |
69 | rcu_read_unlock(); |
70 | |
71 | return sysfs_emit(buf, fmt: "%zd\n" , avail); |
72 | } |
73 | static DEVICE_ATTR_RO(available); |
74 | |
75 | static ssize_t published_show(struct device *dev, struct device_attribute *attr, |
76 | char *buf) |
77 | { |
78 | struct pci_dev *pdev = to_pci_dev(dev); |
79 | struct pci_p2pdma *p2pdma; |
80 | bool published = false; |
81 | |
82 | rcu_read_lock(); |
83 | p2pdma = rcu_dereference(pdev->p2pdma); |
84 | if (p2pdma) |
85 | published = p2pdma->p2pmem_published; |
86 | rcu_read_unlock(); |
87 | |
88 | return sysfs_emit(buf, fmt: "%d\n" , published); |
89 | } |
90 | static DEVICE_ATTR_RO(published); |
91 | |
92 | static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj, |
93 | struct bin_attribute *attr, struct vm_area_struct *vma) |
94 | { |
95 | struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); |
96 | size_t len = vma->vm_end - vma->vm_start; |
97 | struct pci_p2pdma *p2pdma; |
98 | struct percpu_ref *ref; |
99 | unsigned long vaddr; |
100 | void *kaddr; |
101 | int ret; |
102 | |
103 | /* prevent private mappings from being established */ |
104 | if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { |
105 | pci_info_ratelimited(pdev, |
106 | "%s: fail, attempted private mapping\n" , |
107 | current->comm); |
108 | return -EINVAL; |
109 | } |
110 | |
111 | if (vma->vm_pgoff) { |
112 | pci_info_ratelimited(pdev, |
113 | "%s: fail, attempted mapping with non-zero offset\n" , |
114 | current->comm); |
115 | return -EINVAL; |
116 | } |
117 | |
118 | rcu_read_lock(); |
119 | p2pdma = rcu_dereference(pdev->p2pdma); |
120 | if (!p2pdma) { |
121 | ret = -ENODEV; |
122 | goto out; |
123 | } |
124 | |
125 | kaddr = (void *)gen_pool_alloc_owner(pool: p2pdma->pool, size: len, owner: (void **)&ref); |
126 | if (!kaddr) { |
127 | ret = -ENOMEM; |
128 | goto out; |
129 | } |
130 | |
131 | /* |
132 | * vm_insert_page() can sleep, so a reference is taken to mapping |
133 | * such that rcu_read_unlock() can be done before inserting the |
134 | * pages |
135 | */ |
136 | if (unlikely(!percpu_ref_tryget_live_rcu(ref))) { |
137 | ret = -ENODEV; |
138 | goto out_free_mem; |
139 | } |
140 | rcu_read_unlock(); |
141 | |
142 | for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { |
143 | ret = vm_insert_page(vma, addr: vaddr, virt_to_page(kaddr)); |
144 | if (ret) { |
145 | gen_pool_free(pool: p2pdma->pool, addr: (uintptr_t)kaddr, size: len); |
146 | return ret; |
147 | } |
148 | percpu_ref_get(ref); |
149 | put_page(virt_to_page(kaddr)); |
150 | kaddr += PAGE_SIZE; |
151 | len -= PAGE_SIZE; |
152 | } |
153 | |
154 | percpu_ref_put(ref); |
155 | |
156 | return 0; |
157 | out_free_mem: |
158 | gen_pool_free(pool: p2pdma->pool, addr: (uintptr_t)kaddr, size: len); |
159 | out: |
160 | rcu_read_unlock(); |
161 | return ret; |
162 | } |
163 | |
164 | static struct bin_attribute p2pmem_alloc_attr = { |
165 | .attr = { .name = "allocate" , .mode = 0660 }, |
166 | .mmap = p2pmem_alloc_mmap, |
167 | /* |
168 | * Some places where we want to call mmap (ie. python) will check |
169 | * that the file size is greater than the mmap size before allowing |
170 | * the mmap to continue. To work around this, just set the size |
171 | * to be very large. |
172 | */ |
173 | .size = SZ_1T, |
174 | }; |
175 | |
176 | static struct attribute *p2pmem_attrs[] = { |
177 | &dev_attr_size.attr, |
178 | &dev_attr_available.attr, |
179 | &dev_attr_published.attr, |
180 | NULL, |
181 | }; |
182 | |
183 | static struct bin_attribute *p2pmem_bin_attrs[] = { |
184 | &p2pmem_alloc_attr, |
185 | NULL, |
186 | }; |
187 | |
188 | static const struct attribute_group p2pmem_group = { |
189 | .attrs = p2pmem_attrs, |
190 | .bin_attrs = p2pmem_bin_attrs, |
191 | .name = "p2pmem" , |
192 | }; |
193 | |
194 | static void p2pdma_page_free(struct page *page) |
195 | { |
196 | struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(pgmap: page->pgmap); |
197 | /* safe to dereference while a reference is held to the percpu ref */ |
198 | struct pci_p2pdma *p2pdma = |
199 | rcu_dereference_protected(pgmap->provider->p2pdma, 1); |
200 | struct percpu_ref *ref; |
201 | |
202 | gen_pool_free_owner(pool: p2pdma->pool, addr: (uintptr_t)page_to_virt(page), |
203 | PAGE_SIZE, owner: (void **)&ref); |
204 | percpu_ref_put(ref); |
205 | } |
206 | |
207 | static const struct dev_pagemap_ops p2pdma_pgmap_ops = { |
208 | .page_free = p2pdma_page_free, |
209 | }; |
210 | |
211 | static void pci_p2pdma_release(void *data) |
212 | { |
213 | struct pci_dev *pdev = data; |
214 | struct pci_p2pdma *p2pdma; |
215 | |
216 | p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); |
217 | if (!p2pdma) |
218 | return; |
219 | |
220 | /* Flush and disable pci_alloc_p2p_mem() */ |
221 | pdev->p2pdma = NULL; |
222 | synchronize_rcu(); |
223 | |
224 | gen_pool_destroy(p2pdma->pool); |
225 | sysfs_remove_group(kobj: &pdev->dev.kobj, grp: &p2pmem_group); |
226 | xa_destroy(&p2pdma->map_types); |
227 | } |
228 | |
229 | static int pci_p2pdma_setup(struct pci_dev *pdev) |
230 | { |
231 | int error = -ENOMEM; |
232 | struct pci_p2pdma *p2p; |
233 | |
234 | p2p = devm_kzalloc(dev: &pdev->dev, size: sizeof(*p2p), GFP_KERNEL); |
235 | if (!p2p) |
236 | return -ENOMEM; |
237 | |
238 | xa_init(xa: &p2p->map_types); |
239 | |
240 | p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(dev: &pdev->dev)); |
241 | if (!p2p->pool) |
242 | goto out; |
243 | |
244 | error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); |
245 | if (error) |
246 | goto out_pool_destroy; |
247 | |
248 | error = sysfs_create_group(kobj: &pdev->dev.kobj, grp: &p2pmem_group); |
249 | if (error) |
250 | goto out_pool_destroy; |
251 | |
252 | rcu_assign_pointer(pdev->p2pdma, p2p); |
253 | return 0; |
254 | |
255 | out_pool_destroy: |
256 | gen_pool_destroy(p2p->pool); |
257 | out: |
258 | devm_kfree(dev: &pdev->dev, p: p2p); |
259 | return error; |
260 | } |
261 | |
262 | static void pci_p2pdma_unmap_mappings(void *data) |
263 | { |
264 | struct pci_dev *pdev = data; |
265 | |
266 | /* |
267 | * Removing the alloc attribute from sysfs will call |
268 | * unmap_mapping_range() on the inode, teardown any existing userspace |
269 | * mappings and prevent new ones from being created. |
270 | */ |
271 | sysfs_remove_file_from_group(kobj: &pdev->dev.kobj, attr: &p2pmem_alloc_attr.attr, |
272 | group: p2pmem_group.name); |
273 | } |
274 | |
275 | /** |
276 | * pci_p2pdma_add_resource - add memory for use as p2p memory |
277 | * @pdev: the device to add the memory to |
278 | * @bar: PCI BAR to add |
279 | * @size: size of the memory to add, may be zero to use the whole BAR |
280 | * @offset: offset into the PCI BAR |
281 | * |
282 | * The memory will be given ZONE_DEVICE struct pages so that it may |
283 | * be used with any DMA request. |
284 | */ |
285 | int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, |
286 | u64 offset) |
287 | { |
288 | struct pci_p2pdma_pagemap *p2p_pgmap; |
289 | struct dev_pagemap *pgmap; |
290 | struct pci_p2pdma *p2pdma; |
291 | void *addr; |
292 | int error; |
293 | |
294 | if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) |
295 | return -EINVAL; |
296 | |
297 | if (offset >= pci_resource_len(pdev, bar)) |
298 | return -EINVAL; |
299 | |
300 | if (!size) |
301 | size = pci_resource_len(pdev, bar) - offset; |
302 | |
303 | if (size + offset > pci_resource_len(pdev, bar)) |
304 | return -EINVAL; |
305 | |
306 | if (!pdev->p2pdma) { |
307 | error = pci_p2pdma_setup(pdev); |
308 | if (error) |
309 | return error; |
310 | } |
311 | |
312 | p2p_pgmap = devm_kzalloc(dev: &pdev->dev, size: sizeof(*p2p_pgmap), GFP_KERNEL); |
313 | if (!p2p_pgmap) |
314 | return -ENOMEM; |
315 | |
316 | pgmap = &p2p_pgmap->pgmap; |
317 | pgmap->range.start = pci_resource_start(pdev, bar) + offset; |
318 | pgmap->range.end = pgmap->range.start + size - 1; |
319 | pgmap->nr_range = 1; |
320 | pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; |
321 | pgmap->ops = &p2pdma_pgmap_ops; |
322 | |
323 | p2p_pgmap->provider = pdev; |
324 | p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - |
325 | pci_resource_start(pdev, bar); |
326 | |
327 | addr = devm_memremap_pages(dev: &pdev->dev, pgmap); |
328 | if (IS_ERR(ptr: addr)) { |
329 | error = PTR_ERR(ptr: addr); |
330 | goto pgmap_free; |
331 | } |
332 | |
333 | error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, |
334 | pdev); |
335 | if (error) |
336 | goto pages_free; |
337 | |
338 | p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); |
339 | error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr, |
340 | pci_bus_address(pdev, bar) + offset, |
341 | range_len(range: &pgmap->range), dev_to_node(dev: &pdev->dev), |
342 | &pgmap->ref); |
343 | if (error) |
344 | goto pages_free; |
345 | |
346 | pci_info(pdev, "added peer-to-peer DMA memory %#llx-%#llx\n" , |
347 | pgmap->range.start, pgmap->range.end); |
348 | |
349 | return 0; |
350 | |
351 | pages_free: |
352 | devm_memunmap_pages(dev: &pdev->dev, pgmap); |
353 | pgmap_free: |
354 | devm_kfree(dev: &pdev->dev, p: pgmap); |
355 | return error; |
356 | } |
357 | EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource); |
358 | |
359 | /* |
360 | * Note this function returns the parent PCI device with a |
361 | * reference taken. It is the caller's responsibility to drop |
362 | * the reference. |
363 | */ |
364 | static struct pci_dev *find_parent_pci_dev(struct device *dev) |
365 | { |
366 | struct device *parent; |
367 | |
368 | dev = get_device(dev); |
369 | |
370 | while (dev) { |
371 | if (dev_is_pci(dev)) |
372 | return to_pci_dev(dev); |
373 | |
374 | parent = get_device(dev: dev->parent); |
375 | put_device(dev); |
376 | dev = parent; |
377 | } |
378 | |
379 | return NULL; |
380 | } |
381 | |
382 | /* |
383 | * Check if a PCI bridge has its ACS redirection bits set to redirect P2P |
384 | * TLPs upstream via ACS. Returns 1 if the packets will be redirected |
385 | * upstream, 0 otherwise. |
386 | */ |
387 | static int pci_bridge_has_acs_redir(struct pci_dev *pdev) |
388 | { |
389 | int pos; |
390 | u16 ctrl; |
391 | |
392 | pos = pdev->acs_cap; |
393 | if (!pos) |
394 | return 0; |
395 | |
396 | pci_read_config_word(dev: pdev, where: pos + PCI_ACS_CTRL, val: &ctrl); |
397 | |
398 | if (ctrl & (PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC)) |
399 | return 1; |
400 | |
401 | return 0; |
402 | } |
403 | |
/* Append "<pci_name>;" for @pdev to @buf; a NULL @buf is silently ignored. */
static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev)
{
	if (buf)
		seq_buf_printf(buf, "%s;", pci_name(pdev));
}
411 | |
412 | static bool cpu_supports_p2pdma(void) |
413 | { |
414 | #ifdef CONFIG_X86 |
415 | struct cpuinfo_x86 *c = &cpu_data(0); |
416 | |
417 | /* Any AMD CPU whose family ID is Zen or newer supports p2pdma */ |
418 | if (c->x86_vendor == X86_VENDOR_AMD && c->x86 >= 0x17) |
419 | return true; |
420 | #endif |
421 | |
422 | return false; |
423 | } |
424 | |
425 | static const struct pci_p2pdma_whitelist_entry { |
426 | unsigned short vendor; |
427 | unsigned short device; |
428 | enum { |
429 | REQ_SAME_HOST_BRIDGE = 1 << 0, |
430 | } flags; |
431 | } pci_p2pdma_whitelist[] = { |
432 | /* Intel Xeon E5/Core i7 */ |
433 | {PCI_VENDOR_ID_INTEL, 0x3c00, REQ_SAME_HOST_BRIDGE}, |
434 | {PCI_VENDOR_ID_INTEL, 0x3c01, REQ_SAME_HOST_BRIDGE}, |
435 | /* Intel Xeon E7 v3/Xeon E5 v3/Core i7 */ |
436 | {PCI_VENDOR_ID_INTEL, 0x2f00, REQ_SAME_HOST_BRIDGE}, |
437 | {PCI_VENDOR_ID_INTEL, 0x2f01, REQ_SAME_HOST_BRIDGE}, |
438 | /* Intel Skylake-E */ |
439 | {PCI_VENDOR_ID_INTEL, 0x2030, 0}, |
440 | {PCI_VENDOR_ID_INTEL, 0x2031, 0}, |
441 | {PCI_VENDOR_ID_INTEL, 0x2032, 0}, |
442 | {PCI_VENDOR_ID_INTEL, 0x2033, 0}, |
443 | {PCI_VENDOR_ID_INTEL, 0x2020, 0}, |
444 | {PCI_VENDOR_ID_INTEL, 0x09a2, 0}, |
445 | {} |
446 | }; |
447 | |
448 | /* |
449 | * If the first device on host's root bus is either devfn 00.0 or a PCIe |
450 | * Root Port, return it. Otherwise return NULL. |
451 | * |
452 | * We often use a devfn 00.0 "host bridge" in the pci_p2pdma_whitelist[] |
453 | * (though there is no PCI/PCIe requirement for such a device). On some |
454 | * platforms, e.g., Intel Skylake, there is no such host bridge device, and |
455 | * pci_p2pdma_whitelist[] may contain a Root Port at any devfn. |
456 | * |
457 | * This function is similar to pci_get_slot(host->bus, 0), but it does |
458 | * not take the pci_bus_sem lock since __host_bridge_whitelist() must not |
459 | * sleep. |
460 | * |
461 | * For this to be safe, the caller should hold a reference to a device on the |
462 | * bridge, which should ensure the host_bridge device will not be freed |
463 | * or removed from the head of the devices list. |
464 | */ |
465 | static struct pci_dev *pci_host_bridge_dev(struct pci_host_bridge *host) |
466 | { |
467 | struct pci_dev *root; |
468 | |
469 | root = list_first_entry_or_null(&host->bus->devices, |
470 | struct pci_dev, bus_list); |
471 | |
472 | if (!root) |
473 | return NULL; |
474 | |
475 | if (root->devfn == PCI_DEVFN(0, 0)) |
476 | return root; |
477 | |
478 | if (pci_pcie_type(dev: root) == PCI_EXP_TYPE_ROOT_PORT) |
479 | return root; |
480 | |
481 | return NULL; |
482 | } |
483 | |
484 | static bool __host_bridge_whitelist(struct pci_host_bridge *host, |
485 | bool same_host_bridge, bool warn) |
486 | { |
487 | struct pci_dev *root = pci_host_bridge_dev(host); |
488 | const struct pci_p2pdma_whitelist_entry *entry; |
489 | unsigned short vendor, device; |
490 | |
491 | if (!root) |
492 | return false; |
493 | |
494 | vendor = root->vendor; |
495 | device = root->device; |
496 | |
497 | for (entry = pci_p2pdma_whitelist; entry->vendor; entry++) { |
498 | if (vendor != entry->vendor || device != entry->device) |
499 | continue; |
500 | if (entry->flags & REQ_SAME_HOST_BRIDGE && !same_host_bridge) |
501 | return false; |
502 | |
503 | return true; |
504 | } |
505 | |
506 | if (warn) |
507 | pci_warn(root, "Host bridge not in P2PDMA whitelist: %04x:%04x\n" , |
508 | vendor, device); |
509 | |
510 | return false; |
511 | } |
512 | |
513 | /* |
514 | * If we can't find a common upstream bridge take a look at the root |
515 | * complex and compare it to a whitelist of known good hardware. |
516 | */ |
517 | static bool host_bridge_whitelist(struct pci_dev *a, struct pci_dev *b, |
518 | bool warn) |
519 | { |
520 | struct pci_host_bridge *host_a = pci_find_host_bridge(bus: a->bus); |
521 | struct pci_host_bridge *host_b = pci_find_host_bridge(bus: b->bus); |
522 | |
523 | if (host_a == host_b) |
524 | return __host_bridge_whitelist(host: host_a, same_host_bridge: true, warn); |
525 | |
526 | if (__host_bridge_whitelist(host: host_a, same_host_bridge: false, warn) && |
527 | __host_bridge_whitelist(host: host_b, same_host_bridge: false, warn)) |
528 | return true; |
529 | |
530 | return false; |
531 | } |
532 | |
533 | static unsigned long map_types_idx(struct pci_dev *client) |
534 | { |
535 | return (pci_domain_nr(bus: client->bus) << 16) | pci_dev_id(dev: client); |
536 | } |
537 | |
538 | /* |
539 | * Calculate the P2PDMA mapping type and distance between two PCI devices. |
540 | * |
541 | * If the two devices are the same PCI function, return |
542 | * PCI_P2PDMA_MAP_BUS_ADDR and a distance of 0. |
543 | * |
544 | * If they are two functions of the same device, return |
545 | * PCI_P2PDMA_MAP_BUS_ADDR and a distance of 2 (one hop up to the bridge, |
546 | * then one hop back down to another function of the same device). |
547 | * |
548 | * In the case where two devices are connected to the same PCIe switch, |
549 | * return a distance of 4. This corresponds to the following PCI tree: |
550 | * |
551 | * -+ Root Port |
552 | * \+ Switch Upstream Port |
553 | * +-+ Switch Downstream Port 0 |
554 | * + \- Device A |
555 | * \-+ Switch Downstream Port 1 |
556 | * \- Device B |
557 | * |
558 | * The distance is 4 because we traverse from Device A to Downstream Port 0 |
559 | * to the common Switch Upstream Port, back down to Downstream Port 1 and |
560 | * then to Device B. The mapping type returned depends on the ACS |
561 | * redirection setting of the ports along the path. |
562 | * |
563 | * If ACS redirect is set on any port in the path, traffic between the |
564 | * devices will go through the host bridge, so return |
565 | * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; otherwise return |
566 | * PCI_P2PDMA_MAP_BUS_ADDR. |
567 | * |
568 | * Any two devices that have a data path that goes through the host bridge |
569 | * will consult a whitelist. If the host bridge is in the whitelist, return |
570 | * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE with the distance set to the number of |
571 | * ports per above. If the device is not in the whitelist, return |
572 | * PCI_P2PDMA_MAP_NOT_SUPPORTED. |
573 | */ |
574 | static enum pci_p2pdma_map_type |
575 | calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client, |
576 | int *dist, bool verbose) |
577 | { |
578 | enum pci_p2pdma_map_type map_type = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; |
579 | struct pci_dev *a = provider, *b = client, *bb; |
580 | bool acs_redirects = false; |
581 | struct pci_p2pdma *p2pdma; |
582 | struct seq_buf acs_list; |
583 | int acs_cnt = 0; |
584 | int dist_a = 0; |
585 | int dist_b = 0; |
586 | char buf[128]; |
587 | |
588 | seq_buf_init(s: &acs_list, buf, size: sizeof(buf)); |
589 | |
590 | /* |
591 | * Note, we don't need to take references to devices returned by |
592 | * pci_upstream_bridge() seeing we hold a reference to a child |
593 | * device which will already hold a reference to the upstream bridge. |
594 | */ |
595 | while (a) { |
596 | dist_b = 0; |
597 | |
598 | if (pci_bridge_has_acs_redir(pdev: a)) { |
599 | seq_buf_print_bus_devfn(buf: &acs_list, pdev: a); |
600 | acs_cnt++; |
601 | } |
602 | |
603 | bb = b; |
604 | |
605 | while (bb) { |
606 | if (a == bb) |
607 | goto check_b_path_acs; |
608 | |
609 | bb = pci_upstream_bridge(dev: bb); |
610 | dist_b++; |
611 | } |
612 | |
613 | a = pci_upstream_bridge(dev: a); |
614 | dist_a++; |
615 | } |
616 | |
617 | *dist = dist_a + dist_b; |
618 | goto map_through_host_bridge; |
619 | |
620 | check_b_path_acs: |
621 | bb = b; |
622 | |
623 | while (bb) { |
624 | if (a == bb) |
625 | break; |
626 | |
627 | if (pci_bridge_has_acs_redir(pdev: bb)) { |
628 | seq_buf_print_bus_devfn(buf: &acs_list, pdev: bb); |
629 | acs_cnt++; |
630 | } |
631 | |
632 | bb = pci_upstream_bridge(dev: bb); |
633 | } |
634 | |
635 | *dist = dist_a + dist_b; |
636 | |
637 | if (!acs_cnt) { |
638 | map_type = PCI_P2PDMA_MAP_BUS_ADDR; |
639 | goto done; |
640 | } |
641 | |
642 | if (verbose) { |
643 | acs_list.buffer[acs_list.len-1] = 0; /* drop final semicolon */ |
644 | pci_warn(client, "ACS redirect is set between the client and provider (%s)\n" , |
645 | pci_name(provider)); |
646 | pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n" , |
647 | acs_list.buffer); |
648 | } |
649 | acs_redirects = true; |
650 | |
651 | map_through_host_bridge: |
652 | if (!cpu_supports_p2pdma() && |
653 | !host_bridge_whitelist(a: provider, b: client, warn: acs_redirects)) { |
654 | if (verbose) |
655 | pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge or whitelisted host bridge\n" , |
656 | pci_name(provider)); |
657 | map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED; |
658 | } |
659 | done: |
660 | rcu_read_lock(); |
661 | p2pdma = rcu_dereference(provider->p2pdma); |
662 | if (p2pdma) |
663 | xa_store(&p2pdma->map_types, index: map_types_idx(client), |
664 | entry: xa_mk_value(v: map_type), GFP_ATOMIC); |
665 | rcu_read_unlock(); |
666 | return map_type; |
667 | } |
668 | |
669 | /** |
670 | * pci_p2pdma_distance_many - Determine the cumulative distance between |
671 | * a p2pdma provider and the clients in use. |
672 | * @provider: p2pdma provider to check against the client list |
673 | * @clients: array of devices to check (NULL-terminated) |
674 | * @num_clients: number of clients in the array |
675 | * @verbose: if true, print warnings for devices when we return -1 |
676 | * |
677 | * Returns -1 if any of the clients are not compatible, otherwise returns a |
678 | * positive number where a lower number is the preferable choice. (If there's |
679 | * one client that's the same as the provider it will return 0, which is best |
680 | * choice). |
681 | * |
682 | * "compatible" means the provider and the clients are either all behind |
683 | * the same PCI root port or the host bridges connected to each of the devices |
684 | * are listed in the 'pci_p2pdma_whitelist'. |
685 | */ |
686 | int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, |
687 | int num_clients, bool verbose) |
688 | { |
689 | enum pci_p2pdma_map_type map; |
690 | bool not_supported = false; |
691 | struct pci_dev *pci_client; |
692 | int total_dist = 0; |
693 | int i, distance; |
694 | |
695 | if (num_clients == 0) |
696 | return -1; |
697 | |
698 | for (i = 0; i < num_clients; i++) { |
699 | pci_client = find_parent_pci_dev(dev: clients[i]); |
700 | if (!pci_client) { |
701 | if (verbose) |
702 | dev_warn(clients[i], |
703 | "cannot be used for peer-to-peer DMA as it is not a PCI device\n" ); |
704 | return -1; |
705 | } |
706 | |
707 | map = calc_map_type_and_dist(provider, client: pci_client, dist: &distance, |
708 | verbose); |
709 | |
710 | pci_dev_put(dev: pci_client); |
711 | |
712 | if (map == PCI_P2PDMA_MAP_NOT_SUPPORTED) |
713 | not_supported = true; |
714 | |
715 | if (not_supported && !verbose) |
716 | break; |
717 | |
718 | total_dist += distance; |
719 | } |
720 | |
721 | if (not_supported) |
722 | return -1; |
723 | |
724 | return total_dist; |
725 | } |
726 | EXPORT_SYMBOL_GPL(pci_p2pdma_distance_many); |
727 | |
728 | /** |
729 | * pci_has_p2pmem - check if a given PCI device has published any p2pmem |
730 | * @pdev: PCI device to check |
731 | */ |
732 | bool pci_has_p2pmem(struct pci_dev *pdev) |
733 | { |
734 | struct pci_p2pdma *p2pdma; |
735 | bool res; |
736 | |
737 | rcu_read_lock(); |
738 | p2pdma = rcu_dereference(pdev->p2pdma); |
739 | res = p2pdma && p2pdma->p2pmem_published; |
740 | rcu_read_unlock(); |
741 | |
742 | return res; |
743 | } |
744 | EXPORT_SYMBOL_GPL(pci_has_p2pmem); |
745 | |
746 | /** |
747 | * pci_p2pmem_find_many - find a peer-to-peer DMA memory device compatible with |
748 | * the specified list of clients and shortest distance |
749 | * @clients: array of devices to check (NULL-terminated) |
750 | * @num_clients: number of client devices in the list |
751 | * |
752 | * If multiple devices are behind the same switch, the one "closest" to the |
753 | * client devices in use will be chosen first. (So if one of the providers is |
754 | * the same as one of the clients, that provider will be used ahead of any |
755 | * other providers that are unrelated). If multiple providers are an equal |
756 | * distance away, one will be chosen at random. |
757 | * |
758 | * Returns a pointer to the PCI device with a reference taken (use pci_dev_put |
759 | * to return the reference) or NULL if no compatible device is found. The |
760 | * found provider will also be assigned to the client list. |
761 | */ |
762 | struct pci_dev *pci_p2pmem_find_many(struct device **clients, int num_clients) |
763 | { |
764 | struct pci_dev *pdev = NULL; |
765 | int distance; |
766 | int closest_distance = INT_MAX; |
767 | struct pci_dev **closest_pdevs; |
768 | int dev_cnt = 0; |
769 | const int max_devs = PAGE_SIZE / sizeof(*closest_pdevs); |
770 | int i; |
771 | |
772 | closest_pdevs = kmalloc(PAGE_SIZE, GFP_KERNEL); |
773 | if (!closest_pdevs) |
774 | return NULL; |
775 | |
776 | for_each_pci_dev(pdev) { |
777 | if (!pci_has_p2pmem(pdev)) |
778 | continue; |
779 | |
780 | distance = pci_p2pdma_distance_many(pdev, clients, |
781 | num_clients, false); |
782 | if (distance < 0 || distance > closest_distance) |
783 | continue; |
784 | |
785 | if (distance == closest_distance && dev_cnt >= max_devs) |
786 | continue; |
787 | |
788 | if (distance < closest_distance) { |
789 | for (i = 0; i < dev_cnt; i++) |
790 | pci_dev_put(dev: closest_pdevs[i]); |
791 | |
792 | dev_cnt = 0; |
793 | closest_distance = distance; |
794 | } |
795 | |
796 | closest_pdevs[dev_cnt++] = pci_dev_get(dev: pdev); |
797 | } |
798 | |
799 | if (dev_cnt) |
800 | pdev = pci_dev_get(dev: closest_pdevs[get_random_u32_below(ceil: dev_cnt)]); |
801 | |
802 | for (i = 0; i < dev_cnt; i++) |
803 | pci_dev_put(dev: closest_pdevs[i]); |
804 | |
805 | kfree(objp: closest_pdevs); |
806 | return pdev; |
807 | } |
808 | EXPORT_SYMBOL_GPL(pci_p2pmem_find_many); |
809 | |
810 | /** |
811 | * pci_alloc_p2pmem - allocate peer-to-peer DMA memory |
812 | * @pdev: the device to allocate memory from |
813 | * @size: number of bytes to allocate |
814 | * |
815 | * Returns the allocated memory or NULL on error. |
816 | */ |
817 | void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size) |
818 | { |
819 | void *ret = NULL; |
820 | struct percpu_ref *ref; |
821 | struct pci_p2pdma *p2pdma; |
822 | |
823 | /* |
824 | * Pairs with synchronize_rcu() in pci_p2pdma_release() to |
825 | * ensure pdev->p2pdma is non-NULL for the duration of the |
826 | * read-lock. |
827 | */ |
828 | rcu_read_lock(); |
829 | p2pdma = rcu_dereference(pdev->p2pdma); |
830 | if (unlikely(!p2pdma)) |
831 | goto out; |
832 | |
833 | ret = (void *)gen_pool_alloc_owner(pool: p2pdma->pool, size, owner: (void **) &ref); |
834 | if (!ret) |
835 | goto out; |
836 | |
837 | if (unlikely(!percpu_ref_tryget_live_rcu(ref))) { |
838 | gen_pool_free(pool: p2pdma->pool, addr: (unsigned long) ret, size); |
839 | ret = NULL; |
840 | } |
841 | out: |
842 | rcu_read_unlock(); |
843 | return ret; |
844 | } |
845 | EXPORT_SYMBOL_GPL(pci_alloc_p2pmem); |
846 | |
847 | /** |
848 | * pci_free_p2pmem - free peer-to-peer DMA memory |
849 | * @pdev: the device the memory was allocated from |
850 | * @addr: address of the memory that was allocated |
851 | * @size: number of bytes that were allocated |
852 | */ |
853 | void pci_free_p2pmem(struct pci_dev *pdev, void *addr, size_t size) |
854 | { |
855 | struct percpu_ref *ref; |
856 | struct pci_p2pdma *p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); |
857 | |
858 | gen_pool_free_owner(pool: p2pdma->pool, addr: (uintptr_t)addr, size, |
859 | owner: (void **) &ref); |
860 | percpu_ref_put(ref); |
861 | } |
862 | EXPORT_SYMBOL_GPL(pci_free_p2pmem); |
863 | |
864 | /** |
865 | * pci_p2pmem_virt_to_bus - return the PCI bus address for a given virtual |
866 | * address obtained with pci_alloc_p2pmem() |
867 | * @pdev: the device the memory was allocated from |
868 | * @addr: address of the memory that was allocated |
869 | */ |
870 | pci_bus_addr_t pci_p2pmem_virt_to_bus(struct pci_dev *pdev, void *addr) |
871 | { |
872 | struct pci_p2pdma *p2pdma; |
873 | |
874 | if (!addr) |
875 | return 0; |
876 | |
877 | p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); |
878 | if (!p2pdma) |
879 | return 0; |
880 | |
881 | /* |
882 | * Note: when we added the memory to the pool we used the PCI |
883 | * bus address as the physical address. So gen_pool_virt_to_phys() |
884 | * actually returns the bus address despite the misleading name. |
885 | */ |
886 | return gen_pool_virt_to_phys(pool: p2pdma->pool, (unsigned long)addr); |
887 | } |
888 | EXPORT_SYMBOL_GPL(pci_p2pmem_virt_to_bus); |
889 | |
890 | /** |
891 | * pci_p2pmem_alloc_sgl - allocate peer-to-peer DMA memory in a scatterlist |
892 | * @pdev: the device to allocate memory from |
893 | * @nents: the number of SG entries in the list |
894 | * @length: number of bytes to allocate |
895 | * |
896 | * Return: %NULL on error or &struct scatterlist pointer and @nents on success |
897 | */ |
898 | struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev, |
899 | unsigned int *nents, u32 length) |
900 | { |
901 | struct scatterlist *sg; |
902 | void *addr; |
903 | |
904 | sg = kmalloc(size: sizeof(*sg), GFP_KERNEL); |
905 | if (!sg) |
906 | return NULL; |
907 | |
908 | sg_init_table(sg, 1); |
909 | |
910 | addr = pci_alloc_p2pmem(pdev, length); |
911 | if (!addr) |
912 | goto out_free_sg; |
913 | |
914 | sg_set_buf(sg, buf: addr, buflen: length); |
915 | *nents = 1; |
916 | return sg; |
917 | |
918 | out_free_sg: |
919 | kfree(objp: sg); |
920 | return NULL; |
921 | } |
922 | EXPORT_SYMBOL_GPL(pci_p2pmem_alloc_sgl); |
923 | |
924 | /** |
925 | * pci_p2pmem_free_sgl - free a scatterlist allocated by pci_p2pmem_alloc_sgl() |
926 | * @pdev: the device to allocate memory from |
927 | * @sgl: the allocated scatterlist |
928 | */ |
929 | void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl) |
930 | { |
931 | struct scatterlist *sg; |
932 | int count; |
933 | |
934 | for_each_sg(sgl, sg, INT_MAX, count) { |
935 | if (!sg) |
936 | break; |
937 | |
938 | pci_free_p2pmem(pdev, sg_virt(sg), sg->length); |
939 | } |
940 | kfree(objp: sgl); |
941 | } |
942 | EXPORT_SYMBOL_GPL(pci_p2pmem_free_sgl); |
943 | |
944 | /** |
945 | * pci_p2pmem_publish - publish the peer-to-peer DMA memory for use by |
946 | * other devices with pci_p2pmem_find() |
947 | * @pdev: the device with peer-to-peer DMA memory to publish |
948 | * @publish: set to true to publish the memory, false to unpublish it |
949 | * |
950 | * Published memory can be used by other PCI device drivers for |
951 | * peer-2-peer DMA operations. Non-published memory is reserved for |
952 | * exclusive use of the device driver that registers the peer-to-peer |
953 | * memory. |
954 | */ |
955 | void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) |
956 | { |
957 | struct pci_p2pdma *p2pdma; |
958 | |
959 | rcu_read_lock(); |
960 | p2pdma = rcu_dereference(pdev->p2pdma); |
961 | if (p2pdma) |
962 | p2pdma->p2pmem_published = publish; |
963 | rcu_read_unlock(); |
964 | } |
965 | EXPORT_SYMBOL_GPL(pci_p2pmem_publish); |
966 | |
967 | static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, |
968 | struct device *dev) |
969 | { |
970 | enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED; |
971 | struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider; |
972 | struct pci_dev *client; |
973 | struct pci_p2pdma *p2pdma; |
974 | int dist; |
975 | |
976 | if (!provider->p2pdma) |
977 | return PCI_P2PDMA_MAP_NOT_SUPPORTED; |
978 | |
979 | if (!dev_is_pci(dev)) |
980 | return PCI_P2PDMA_MAP_NOT_SUPPORTED; |
981 | |
982 | client = to_pci_dev(dev); |
983 | |
984 | rcu_read_lock(); |
985 | p2pdma = rcu_dereference(provider->p2pdma); |
986 | |
987 | if (p2pdma) |
988 | type = xa_to_value(entry: xa_load(&p2pdma->map_types, |
989 | index: map_types_idx(client))); |
990 | rcu_read_unlock(); |
991 | |
992 | if (type == PCI_P2PDMA_MAP_UNKNOWN) |
993 | return calc_map_type_and_dist(provider, client, dist: &dist, verbose: true); |
994 | |
995 | return type; |
996 | } |
997 | |
998 | /** |
999 | * pci_p2pdma_map_segment - map an sg segment determining the mapping type |
1000 | * @state: State structure that should be declared outside of the for_each_sg() |
1001 | * loop and initialized to zero. |
1002 | * @dev: DMA device that's doing the mapping operation |
1003 | * @sg: scatterlist segment to map |
1004 | * |
1005 | * This is a helper to be used by non-IOMMU dma_map_sg() implementations where |
1006 | * the sg segment is the same for the page_link and the dma_address. |
1007 | * |
1008 | * Attempt to map a single segment in an SGL with the PCI bus address. |
1009 | * The segment must point to a PCI P2PDMA page and thus must be |
1010 | * wrapped in a is_pci_p2pdma_page(sg_page(sg)) check. |
1011 | * |
1012 | * Returns the type of mapping used and maps the page if the type is |
1013 | * PCI_P2PDMA_MAP_BUS_ADDR. |
1014 | */ |
1015 | enum pci_p2pdma_map_type |
1016 | pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device *dev, |
1017 | struct scatterlist *sg) |
1018 | { |
1019 | if (state->pgmap != sg_page(sg)->pgmap) { |
1020 | state->pgmap = sg_page(sg)->pgmap; |
1021 | state->map = pci_p2pdma_map_type(pgmap: state->pgmap, dev); |
1022 | state->bus_off = to_p2p_pgmap(pgmap: state->pgmap)->bus_offset; |
1023 | } |
1024 | |
1025 | if (state->map == PCI_P2PDMA_MAP_BUS_ADDR) { |
1026 | sg->dma_address = sg_phys(sg) + state->bus_off; |
1027 | sg_dma_len(sg) = sg->length; |
1028 | sg_dma_mark_bus_address(sg); |
1029 | } |
1030 | |
1031 | return state->map; |
1032 | } |
1033 | |
1034 | /** |
1035 | * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store |
1036 | * to enable p2pdma |
1037 | * @page: contents of the value to be stored |
1038 | * @p2p_dev: returns the PCI device that was selected to be used |
1039 | * (if one was specified in the stored value) |
1040 | * @use_p2pdma: returns whether to enable p2pdma or not |
1041 | * |
1042 | * Parses an attribute value to decide whether to enable p2pdma. |
1043 | * The value can select a PCI device (using its full BDF device |
1044 | * name) or a boolean (in any format kstrtobool() accepts). A false |
1045 | * value disables p2pdma, a true value expects the caller |
1046 | * to automatically find a compatible device and specifying a PCI device |
1047 | * expects the caller to use the specific provider. |
1048 | * |
1049 | * pci_p2pdma_enable_show() should be used as the show operation for |
1050 | * the attribute. |
1051 | * |
1052 | * Returns 0 on success |
1053 | */ |
1054 | int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, |
1055 | bool *use_p2pdma) |
1056 | { |
1057 | struct device *dev; |
1058 | |
1059 | dev = bus_find_device_by_name(bus: &pci_bus_type, NULL, name: page); |
1060 | if (dev) { |
1061 | *use_p2pdma = true; |
1062 | *p2p_dev = to_pci_dev(dev); |
1063 | |
1064 | if (!pci_has_p2pmem(*p2p_dev)) { |
1065 | pci_err(*p2p_dev, |
1066 | "PCI device has no peer-to-peer memory: %s\n" , |
1067 | page); |
1068 | pci_dev_put(dev: *p2p_dev); |
1069 | return -ENODEV; |
1070 | } |
1071 | |
1072 | return 0; |
1073 | } else if ((page[0] == '0' || page[0] == '1') && !iscntrl(page[1])) { |
1074 | /* |
1075 | * If the user enters a PCI device that doesn't exist |
1076 | * like "0000:01:00.1", we don't want kstrtobool to think |
1077 | * it's a '0' when it's clearly not what the user wanted. |
1078 | * So we require 0's and 1's to be exactly one character. |
1079 | */ |
1080 | } else if (!kstrtobool(s: page, res: use_p2pdma)) { |
1081 | return 0; |
1082 | } |
1083 | |
1084 | pr_err("No such PCI device: %.*s\n" , (int)strcspn(page, "\n" ), page); |
1085 | return -ENODEV; |
1086 | } |
1087 | EXPORT_SYMBOL_GPL(pci_p2pdma_enable_store); |
1088 | |
/**
 * pci_p2pdma_enable_show - show a configfs/sysfs attribute indicating
 *		whether p2pdma is enabled
 * @page: buffer the attribute value is written to
 * @p2p_dev: the selected p2p device (NULL if no device is selected)
 * @use_p2pdma: whether p2pdma has been enabled
 *
 * Attributes that use pci_p2pdma_enable_store() should use this function
 * to show the value of the attribute.
 *
 * Writes "0" when p2pdma is disabled, "1" when it is enabled without a
 * specific device, and the PCI name of @p2p_dev otherwise, each followed
 * by a newline.
 *
 * Returns the value returned by sprintf() for the string written to @page.
 */
ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
			       bool use_p2pdma)
{
	if (use_p2pdma && p2p_dev)
		return sprintf(page, "%s\n", pci_name(p2p_dev));

	if (use_p2pdma)
		return sprintf(page, "1\n");

	return sprintf(page, "0\n");
}
EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
1113 | |