1 | /* |
2 | * Copyright (c) 2005 Topspin Communications. All rights reserved. |
3 | * Copyright (c) 2005 Cisco Systems. All rights reserved. |
4 | * Copyright (c) 2005 Mellanox Technologies. All rights reserved. |
5 | * Copyright (c) 2020 Intel Corporation. All rights reserved. |
6 | * |
7 | * This software is available to you under a choice of one of two |
8 | * licenses. You may choose to be licensed under the terms of the GNU |
9 | * General Public License (GPL) Version 2, available from the file |
10 | * COPYING in the main directory of this source tree, or the |
11 | * OpenIB.org BSD license below: |
12 | * |
13 | * Redistribution and use in source and binary forms, with or |
14 | * without modification, are permitted provided that the following |
15 | * conditions are met: |
16 | * |
17 | * - Redistributions of source code must retain the above |
18 | * copyright notice, this list of conditions and the following |
19 | * disclaimer. |
20 | * |
21 | * - Redistributions in binary form must reproduce the above |
22 | * copyright notice, this list of conditions and the following |
23 | * disclaimer in the documentation and/or other materials |
24 | * provided with the distribution. |
25 | * |
26 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
27 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
28 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
29 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
30 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
31 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
32 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
33 | * SOFTWARE. |
34 | */ |
35 | |
36 | #include <linux/mm.h> |
37 | #include <linux/dma-mapping.h> |
38 | #include <linux/sched/signal.h> |
39 | #include <linux/sched/mm.h> |
40 | #include <linux/export.h> |
41 | #include <linux/slab.h> |
42 | #include <linux/pagemap.h> |
43 | #include <linux/count_zeros.h> |
44 | #include <rdma/ib_umem_odp.h> |
45 | |
46 | #include "uverbs.h" |
47 | |
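/*
 * Unpin the pages backing a umem and free its append table. When @dirty
 * is set, the scatter/gather table is DMA-unmapped first and writable
 * pages are marked dirty as they are unpinned.
 */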
48 | static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) |
49 | { |
50 | bool make_dirty = umem->writable && dirty; |
51 | struct scatterlist *sg; |
52 | unsigned int i; |
53 | |
54 | if (dirty) |
		ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt,
					   DMA_BIDIRECTIONAL, 0);
57 | |
58 | for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) |
		unpin_user_page_range_dirty_lock(sg_page(sg),
			DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty);
61 | |
	sg_free_append_table(&umem->sgt_append);
63 | } |
64 | |
65 | /** |
66 | * ib_umem_find_best_pgsz - Find best HW page size to use for this MR |
67 | * |
68 | * @umem: umem struct |
69 | * @pgsz_bitmap: bitmap of HW supported page sizes |
70 | * @virt: IOVA |
71 | * |
 * This helper is intended for HW that supports multiple page
 * sizes but can do only a single page size in an MR.
74 | * |
75 | * Returns 0 if the umem requires page sizes not supported by |
76 | * the driver to be mapped. Drivers always supporting PAGE_SIZE |
77 | * or smaller will never see a 0 result. |
78 | */ |
79 | unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, |
80 | unsigned long pgsz_bitmap, |
81 | unsigned long virt) |
82 | { |
83 | struct scatterlist *sg; |
84 | unsigned long va, pgoff; |
85 | dma_addr_t mask; |
86 | int i; |
87 | |
88 | umem->iova = va = virt; |
89 | |
90 | if (umem->is_odp) { |
91 | unsigned int page_size = BIT(to_ib_umem_odp(umem)->page_shift); |
92 | |
		/* ODP must always be self-consistent. */
94 | if (!(pgsz_bitmap & page_size)) |
95 | return 0; |
96 | return page_size; |
97 | } |
98 | |
99 | /* The best result is the smallest page size that results in the minimum |
100 | * number of required pages. Compute the largest page size that could |
101 | * work based on VA address bits that don't change. |
102 | */ |
103 | mask = pgsz_bitmap & |
104 | GENMASK(BITS_PER_LONG - 1, |
105 | bits_per((umem->length - 1 + virt) ^ virt)); |
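	/*
	 * Worked example (illustrative): for virt = 0x6000 and
	 * length = 0x3000 the last byte sits at VA 0x8fff, so
	 * 0x8fff ^ 0x6000 = 0xefff and bits_per() = 16. Only page
	 * sizes of 64K and up could then map the umem as one page.
	 */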
106 | /* offset into first SGL */ |
107 | pgoff = umem->address & ~PAGE_MASK; |
108 | |
109 | for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { |
110 | /* Walk SGL and reduce max page size if VA/PA bits differ |
111 | * for any address. |
112 | */ |
113 | mask |= (sg_dma_address(sg) + pgoff) ^ va; |
114 | va += sg_dma_len(sg) - pgoff; |
115 | /* Except for the last entry, the ending iova alignment sets |
116 | * the maximum possible page size as the low bits of the iova |
117 | * must be zero when starting the next chunk. |
118 | */ |
119 | if (i != (umem->sgt_append.sgt.nents - 1)) |
120 | mask |= va; |
121 | pgoff = 0; |
122 | } |
123 | |
	/* The mask accumulates 1's in each position where the VA and physical
	 * address differ, thus the number of trailing zeros is the largest
	 * page size that can pass the VA through to the physical.
	 */
128 | if (mask) |
129 | pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0); |
130 | return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0; |
131 | } |
132 | EXPORT_SYMBOL(ib_umem_find_best_pgsz); |
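
/*
 * Example usage (an illustrative sketch, not from this file): a driver
 * whose HW supports 4K, 2M and 1G pages might pick a page size for a
 * freshly pinned umem like this ("mr" is hypothetical driver state):
 *
 *	unsigned long pgsz;
 *
 *	pgsz = ib_umem_find_best_pgsz(umem, SZ_4K | SZ_2M | SZ_1G,
 *				      mr->iova);
 *	if (!pgsz)
 *		return -EINVAL;
 */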
133 | |
134 | /** |
135 | * ib_umem_get - Pin and DMA map userspace memory. |
136 | * |
137 | * @device: IB device to connect UMEM |
138 | * @addr: userspace virtual address to start at |
139 | * @size: length of region to pin |
140 | * @access: IB_ACCESS_xxx flags for memory being pinned |
141 | */ |
142 | struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, |
143 | size_t size, int access) |
144 | { |
145 | struct ib_umem *umem; |
146 | struct page **page_list; |
147 | unsigned long lock_limit; |
148 | unsigned long new_pinned; |
149 | unsigned long cur_base; |
150 | unsigned long dma_attr = 0; |
151 | struct mm_struct *mm; |
152 | unsigned long npages; |
153 | int pinned, ret; |
154 | unsigned int gup_flags = FOLL_LONGTERM; |
155 | |
156 | /* |
157 | * If the combination of the addr and size requested for this memory |
158 | * region causes an integer overflow, return error. |
159 | */ |
	if (((addr + size) < addr) ||
	    PAGE_ALIGN(addr + size) < (addr + size))
		return ERR_PTR(-EINVAL);

	if (!can_do_mlock())
		return ERR_PTR(-EPERM);

	if (access & IB_ACCESS_ON_DEMAND)
		return ERR_PTR(-EOPNOTSUPP);

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);
173 | umem->ibdev = device; |
174 | umem->length = size; |
175 | umem->address = addr; |
176 | /* |
177 | * Drivers should call ib_umem_find_best_pgsz() to set the iova |
178 | * correctly. |
179 | */ |
180 | umem->iova = addr; |
	umem->writable = ib_access_writable(access);
182 | umem->owning_mm = mm = current->mm; |
183 | mmgrab(mm); |
184 | |
185 | page_list = (struct page **) __get_free_page(GFP_KERNEL); |
186 | if (!page_list) { |
187 | ret = -ENOMEM; |
188 | goto umem_kfree; |
189 | } |
190 | |
191 | npages = ib_umem_num_pages(umem); |
192 | if (npages == 0 || npages > UINT_MAX) { |
193 | ret = -EINVAL; |
194 | goto out; |
195 | } |
196 | |
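	/* RLIMIT_MEMLOCK is given in bytes; convert it to a page count. */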
197 | lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; |
198 | |
	new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
		atomic64_sub(npages, &mm->pinned_vm);
202 | ret = -ENOMEM; |
203 | goto out; |
204 | } |
205 | |
206 | cur_base = addr & PAGE_MASK; |
207 | |
208 | if (umem->writable) |
209 | gup_flags |= FOLL_WRITE; |
210 | |
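	/*
	 * Pin in batches: page_list is a single page of struct page
	 * pointers, so each pass pins at most PAGE_SIZE /
	 * sizeof(struct page *) pages and appends them to the SG table.
	 */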
211 | while (npages) { |
212 | cond_resched(); |
		pinned = pin_user_pages_fast(cur_base,
					     min_t(unsigned long, npages,
						   PAGE_SIZE /
						   sizeof(struct page *)),
					     gup_flags, page_list);
218 | if (pinned < 0) { |
219 | ret = pinned; |
220 | goto umem_release; |
221 | } |
222 | |
223 | cur_base += pinned * PAGE_SIZE; |
224 | npages -= pinned; |
		ret = sg_alloc_append_table_from_pages(
			&umem->sgt_append, page_list, pinned, 0,
			pinned << PAGE_SHIFT, ib_dma_max_seg_size(device),
			npages, GFP_KERNEL);
		if (ret) {
			unpin_user_pages_dirty_lock(page_list, pinned, 0);
231 | goto umem_release; |
232 | } |
233 | } |
234 | |
235 | if (access & IB_ACCESS_RELAXED_ORDERING) |
236 | dma_attr |= DMA_ATTR_WEAK_ORDERING; |
237 | |
	ret = ib_dma_map_sgtable_attrs(device, &umem->sgt_append.sgt,
				       DMA_BIDIRECTIONAL, dma_attr);
240 | if (ret) |
241 | goto umem_release; |
242 | goto out; |
243 | |
244 | umem_release: |
	__ib_umem_release(device, umem, 0);
	atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
247 | out: |
248 | free_page((unsigned long) page_list); |
249 | umem_kfree: |
250 | if (ret) { |
		mmdrop(umem->owning_mm);
		kfree(umem);
	}
	return ret ? ERR_PTR(ret) : umem;
255 | } |
256 | EXPORT_SYMBOL(ib_umem_get); |
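
/*
 * Example usage (an illustrative sketch; "mr" and "access_flags" are
 * hypothetical driver state): a reg_mr path would typically do:
 *
 *	mr->umem = ib_umem_get(pd->device, start, length, access_flags);
 *	if (IS_ERR(mr->umem))
 *		return ERR_CAST(mr->umem);
 *
 * and call ib_umem_release(mr->umem) when the MR is destroyed.
 */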
257 | |
258 | /** |
259 | * ib_umem_release - release memory pinned with ib_umem_get |
260 | * @umem: umem struct to release |
261 | */ |
262 | void ib_umem_release(struct ib_umem *umem) |
263 | { |
264 | if (!umem) |
265 | return; |
	if (umem->is_dmabuf)
		return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
	if (umem->is_odp)
		return ib_umem_odp_release(to_ib_umem_odp(umem));
270 | |
	__ib_umem_release(umem->ibdev, umem, 1);

	atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
	mmdrop(umem->owning_mm);
	kfree(umem);
276 | } |
277 | EXPORT_SYMBOL(ib_umem_release); |
278 | |
279 | /* |
280 | * Copy from the given ib_umem's pages to the given buffer. |
281 | * |
282 | * umem - the umem to copy from |
283 | * offset - offset to start copying from |
284 | * dst - destination buffer |
285 | * length - buffer length |
286 | * |
287 | * Returns 0 on success, or an error code. |
288 | */ |
289 | int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, |
290 | size_t length) |
291 | { |
292 | size_t end = offset + length; |
293 | int ret; |
294 | |
295 | if (offset > umem->length || length > umem->length - offset) { |
		pr_err("%s not in range. offset: %zd umem length: %zd end: %zd\n",
		       __func__, offset, umem->length, end);
298 | return -EINVAL; |
299 | } |
300 | |
	ret = sg_pcopy_to_buffer(umem->sgt_append.sgt.sgl,
				 umem->sgt_append.sgt.orig_nents, dst, length,
				 offset + ib_umem_offset(umem));
304 | |
305 | if (ret < 0) |
306 | return ret; |
307 | else if (ret != length) |
308 | return -EINVAL; |
309 | else |
310 | return 0; |
311 | } |
312 | EXPORT_SYMBOL(ib_umem_copy_from); |
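
/*
 * Example (an illustrative sketch): copying the first 64 bytes of a
 * registered region into a local scratch buffer:
 *
 *	u8 buf[64];
 *	int err;
 *
 *	err = ib_umem_copy_from(buf, umem, 0, sizeof(buf));
 *	if (err)
 *		return err;
 */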
313 | |