// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
 * Copyright 2019 Marvell. All rights reserved.
 */
#include <linux/xarray.h>
#include "uverbs.h"
#include "core_priv.h"

/**
 * rdma_umap_priv_init() - Initialize the private data of a vma
 *
 * @priv: The already allocated private data
 * @vma: The vm area struct that needs private data
 * @entry: entry into the mmap_xa that needs to be linked with
 *         this vma
 *
 * Each time we map IO memory into user space this keeps track of the
 * mapping. When the device is hot-unplugged we 'zap' the mmaps in user space
 * to point to the zero page and allow the hot unplug to proceed.
 *
 * This is necessary for cases like PCI physical hot unplug as the actual BAR
 * memory may vanish after this and access to it from userspace could MCE.
 *
 * RDMA drivers supporting disassociation must have their user space designed
 * to cope in some way with their IO pages going to the zero page.
 */
void rdma_umap_priv_init(struct rdma_umap_priv *priv,
			 struct vm_area_struct *vma,
			 struct rdma_user_mmap_entry *entry)
{
	struct ib_uverbs_file *ufile = vma->vm_file->private_data;

	priv->vma = vma;
	if (entry) {
		kref_get(&entry->ref);
		priv->entry = entry;
	}
	vma->vm_private_data = priv;
	/* vm_ops is setup in ib_uverbs_mmap() to avoid module dependencies */

	mutex_lock(&ufile->umap_lock);
	list_add(&priv->list, &ufile->umaps);
	mutex_unlock(&ufile->umap_lock);
}
EXPORT_SYMBOL(rdma_umap_priv_init);
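
/*
 * Usage sketch: a driver that establishes its own mapping (rather than
 * going through rdma_user_mmap_io()) can still participate in the zap
 * mechanism by registering a priv. The "foo" driver, foo_pfn and the
 * error handling below are hypothetical, not part of this file.
 *
 *	static int foo_mmap(struct ib_ucontext *uctx,
 *			    struct vm_area_struct *vma)
 *	{
 *		struct rdma_umap_priv *priv;
 *
 *		priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 *		if (!priv)
 *			return -ENOMEM;
 *
 *		if (io_remap_pfn_range(vma, vma->vm_start, foo_pfn,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot)) {
 *			kfree(priv);
 *			return -EAGAIN;
 *		}
 *
 *		// No mmap_entry for this fixed mapping, hence NULL
 *		rdma_umap_priv_init(priv, vma, NULL);
 *		return 0;
 *	}
 */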

/**
 * rdma_user_mmap_io() - Map IO memory into a process
 *
 * @ucontext: associated user context
 * @vma: the vma related to the current mmap call
 * @pfn: pfn to map
 * @size: size to map
 * @prot: pgprot to use in remap call
 * @entry: mmap_entry retrieved from rdma_user_mmap_entry_get(), or NULL
 *         if mmap_entry is not used by the driver
 *
 * This is to be called by drivers as part of their mmap() functions if they
 * wish to send something like PCI-E BAR memory to userspace.
 *
 * Return -EINVAL on wrong flags or size, -EAGAIN on failure to map. 0 on
 * success.
 */
int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
		      unsigned long pfn, unsigned long size, pgprot_t prot,
		      struct rdma_user_mmap_entry *entry)
{
	struct ib_uverbs_file *ufile = ucontext->ufile;
	struct rdma_umap_priv *priv;

	if (!(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	if (vma->vm_end - vma->vm_start != size)
		return -EINVAL;

	/* Driver is using this wrong, must be called by ib_uverbs_mmap */
	if (WARN_ON(!vma->vm_file ||
		    vma->vm_file->private_data != ufile))
		return -EINVAL;
	lockdep_assert_held(&ufile->device->disassociate_srcu);

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	vma->vm_page_prot = prot;
	if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
		kfree(priv);
		return -EAGAIN;
	}

	rdma_umap_priv_init(priv, vma, entry);
	return 0;
}
EXPORT_SYMBOL(rdma_user_mmap_io);
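
/*
 * Usage sketch: a driver .mmap handler pushing a doorbell BAR page to
 * userspace through rdma_user_mmap_io(). The foo_ucontext type and its
 * db_phys and ibuc fields are hypothetical; pgprot_noncached() is the
 * usual choice for device registers.
 *
 *	static int foo_mmap(struct ib_ucontext *ibuctx,
 *			    struct vm_area_struct *vma)
 *	{
 *		struct foo_ucontext *uctx =
 *			container_of(ibuctx, struct foo_ucontext, ibuc);
 *
 *		return rdma_user_mmap_io(ibuctx, vma,
 *					 uctx->db_phys >> PAGE_SHIFT,
 *					 PAGE_SIZE,
 *					 pgprot_noncached(vma->vm_page_prot),
 *					 NULL);
 *	}
 */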

/**
 * rdma_user_mmap_entry_get_pgoff() - Get an entry from the mmap_xa
 *
 * @ucontext: associated user context
 * @pgoff: The mmap offset >> PAGE_SHIFT
 *
 * This function is called when a user tries to mmap with an offset (returned
 * by rdma_user_mmap_get_offset()) it initially received from the driver. The
 * rdma_user_mmap_entry was created by the function
 * rdma_user_mmap_entry_insert(). This function increases the refcnt of the
 * entry so that it won't be deleted from the xarray in the meantime.
 *
 * Return a reference to the entry if it exists or NULL if there is no
 * match. rdma_user_mmap_entry_put() must be called to put the reference.
 */
struct rdma_user_mmap_entry *
rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
			       unsigned long pgoff)
{
	struct rdma_user_mmap_entry *entry;

	if (pgoff > U32_MAX)
		return NULL;

	xa_lock(&ucontext->mmap_xa);

	entry = xa_load(&ucontext->mmap_xa, pgoff);

	/*
	 * If the refcount is zero the entry is already being deleted;
	 * driver_removed indicates that no further mmaps are possible and
	 * we are waiting for the active VMAs to be closed.
	 */
	if (!entry || entry->start_pgoff != pgoff || entry->driver_removed ||
	    !kref_get_unless_zero(&entry->ref))
		goto err;

	xa_unlock(&ucontext->mmap_xa);

	ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] returned\n",
		  pgoff, entry->npages);

	return entry;

err:
	xa_unlock(&ucontext->mmap_xa);
	return NULL;
}
EXPORT_SYMBOL(rdma_user_mmap_entry_get_pgoff);

/**
 * rdma_user_mmap_entry_get() - Get an entry from the mmap_xa
 *
 * @ucontext: associated user context
 * @vma: the vma being mmap'd into
 *
 * This function is like rdma_user_mmap_entry_get_pgoff() except that it also
 * checks that the VMA is correct.
 */
struct rdma_user_mmap_entry *
rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
			 struct vm_area_struct *vma)
{
	struct rdma_user_mmap_entry *entry;

	if (!(vma->vm_flags & VM_SHARED))
		return NULL;
	entry = rdma_user_mmap_entry_get_pgoff(ucontext, vma->vm_pgoff);
	if (!entry)
		return NULL;
	if (entry->npages * PAGE_SIZE != vma->vm_end - vma->vm_start) {
		rdma_user_mmap_entry_put(entry);
		return NULL;
	}
	return entry;
}
EXPORT_SYMBOL(rdma_user_mmap_entry_get);
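
/*
 * Usage sketch: an .mmap handler for driver-allocated entries, in the
 * style used by drivers such as efa. The foo_mmap_entry type (embedding
 * the rdma_user_mmap_entry) and its pfn field are hypothetical.
 *
 *	static int foo_mmap(struct ib_ucontext *ibuctx,
 *			    struct vm_area_struct *vma)
 *	{
 *		struct rdma_user_mmap_entry *rentry;
 *		struct foo_mmap_entry *fentry;
 *		int ret;
 *
 *		rentry = rdma_user_mmap_entry_get(ibuctx, vma);
 *		if (!rentry)
 *			return -EINVAL;
 *		fentry = container_of(rentry, struct foo_mmap_entry,
 *				      rdma_entry);
 *		ret = rdma_user_mmap_io(ibuctx, vma, fentry->pfn,
 *					vma->vm_end - vma->vm_start,
 *					pgprot_writecombine(vma->vm_page_prot),
 *					rentry);
 *
 *		// On success rdma_user_mmap_io() holds its own reference
 *		// via rdma_umap_priv_init(); drop the lookup reference
 *		// either way.
 *		rdma_user_mmap_entry_put(rentry);
 *		return ret;
 *	}
 */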

static void rdma_user_mmap_entry_free(struct kref *kref)
{
	struct rdma_user_mmap_entry *entry =
		container_of(kref, struct rdma_user_mmap_entry, ref);
	struct ib_ucontext *ucontext = entry->ucontext;
	unsigned long i;

	/*
	 * Erase all xarray indices occupied by this single entry; this is
	 * deferred until all VMAs are closed so that the mmap offsets remain
	 * unique.
	 */
	xa_lock(&ucontext->mmap_xa);
	for (i = 0; i < entry->npages; i++)
		__xa_erase(&ucontext->mmap_xa, entry->start_pgoff + i);
	xa_unlock(&ucontext->mmap_xa);

	ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] removed\n",
		  entry->start_pgoff, entry->npages);

	if (ucontext->device->ops.mmap_free)
		ucontext->device->ops.mmap_free(entry);
}

/**
 * rdma_user_mmap_entry_put() - Drop reference to the mmap entry
 *
 * @entry: an entry in the mmap_xa
 *
 * This function is called when the mapping is closed if it was
 * an io mapping, or when the driver is done with the entry for
 * some other reason. It should be called after a successful
 * rdma_user_mmap_entry_get() once the entry is no longer needed;
 * it will erase the entry and free it when its refcnt reaches zero.
 */
void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry)
{
	kref_put(&entry->ref, rdma_user_mmap_entry_free);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_put);

/**
 * rdma_user_mmap_entry_remove() - Drop reference to entry and
 *                                 mark it as unmappable
 *
 * @entry: the entry to remove from the mmap_xa
 *
 * Drivers can call this to prevent userspace from creating more mappings for
 * entry, however existing mmaps continue to exist and ops->mmap_free() will
 * not be called until all user mmaps are destroyed.
 */
void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry)
{
	if (!entry)
		return;

	xa_lock(&entry->ucontext->mmap_xa);
	entry->driver_removed = true;
	xa_unlock(&entry->ucontext->mmap_xa);
	kref_put(&entry->ref, rdma_user_mmap_entry_free);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_remove);
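
/*
 * Usage sketch: object teardown while userspace may still hold mappings.
 * The driver calls rdma_user_mmap_entry_remove() to block new mmaps and
 * drop the insert-time reference, and frees its memory only from
 * ops.mmap_free() once the last VMA is gone. foo_db, foo_mmap_entry and
 * foo_mmap_free() are hypothetical.
 *
 *	static void foo_destroy_db(struct foo_db *db)
 *	{
 *		// NULL-safe; drops the reference taken at insert time
 *		rdma_user_mmap_entry_remove(db->rdma_entry);
 *	}
 *
 *	static void foo_mmap_free(struct rdma_user_mmap_entry *rentry)
 *	{
 *		struct foo_mmap_entry *fentry =
 *			container_of(rentry, struct foo_mmap_entry,
 *				     rdma_entry);
 *
 *		kfree(fentry);
 *	}
 */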

/**
 * rdma_user_mmap_entry_insert_range() - Insert an entry to the mmap_xa
 *                                       in a given range.
 *
 * @ucontext: associated user context.
 * @entry: the entry to insert into the mmap_xa
 * @length: length of the address that will be mmapped
 * @min_pgoff: minimum pgoff to be returned
 * @max_pgoff: maximum pgoff to be returned
 *
 * This function should be called by drivers that use the rdma_user_mmap
 * interface for implementing their mmap syscall. A database of mmap offsets
 * is handled in the core and helper functions are provided to insert entries
 * into the database and extract entries when the user calls mmap with the
 * given offset. The function allocates a unique page offset in the given
 * range that should be provided to the user; the user will use the offset to
 * retrieve information such as the address to be mapped and how.
 *
 * Return: 0 on success and -ENOMEM on failure
 */
int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext,
				      struct rdma_user_mmap_entry *entry,
				      size_t length, u32 min_pgoff,
				      u32 max_pgoff)
{
	struct ib_uverbs_file *ufile = ucontext->ufile;
	XA_STATE(xas, &ucontext->mmap_xa, min_pgoff);
	u32 xa_first, xa_last, npages;
	int err;
	u32 i;

	if (!entry)
		return -EINVAL;

	kref_init(&entry->ref);
	entry->ucontext = ucontext;

	/*
	 * We want the whole allocation to be done without interruption from a
	 * different thread. The allocation requires finding a free range and
	 * storing. During the xa_insert the lock could be released, possibly
	 * allowing another thread to choose the same range.
	 */
	mutex_lock(&ufile->umap_lock);

	xa_lock(&ucontext->mmap_xa);

	/* We want to find an empty range */
	npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE);
	entry->npages = npages;
	while (true) {
		/* First find an empty index */
		xas_find_marked(&xas, max_pgoff, XA_FREE_MARK);
		if (xas.xa_node == XAS_RESTART)
			goto err_unlock;

		xa_first = xas.xa_index;

		/* Is there enough room to have the range? */
		if (check_add_overflow(xa_first, npages, &xa_last))
			goto err_unlock;

		/*
		 * Now look for the next present entry. If an entry doesn't
		 * exist, we found an empty range and can proceed.
		 */
		xas_next_entry(&xas, xa_last - 1);
		if (xas.xa_node == XAS_BOUNDS || xas.xa_index >= xa_last)
			break;
	}

	for (i = xa_first; i < xa_last; i++) {
		err = __xa_insert(&ucontext->mmap_xa, i, entry, GFP_KERNEL);
		if (err)
			goto err_undo;
	}

	/*
	 * Internally the kernel uses a page offset, while in libc mmap()
	 * takes a byte offset. Drivers should therefore hand userspace the
	 * byte offset from rdma_user_mmap_get_offset(), not the raw pgoff.
	 */
	entry->start_pgoff = xa_first;
	xa_unlock(&ucontext->mmap_xa);
	mutex_unlock(&ufile->umap_lock);

	ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#x] inserted\n",
		  entry->start_pgoff, npages);

	return 0;

err_undo:
	for (; i > xa_first; i--)
		__xa_erase(&ucontext->mmap_xa, i - 1);

err_unlock:
	xa_unlock(&ucontext->mmap_xa);
	mutex_unlock(&ufile->umap_lock);
	return -ENOMEM;
}
EXPORT_SYMBOL(rdma_user_mmap_entry_insert_range);
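
/*
 * Usage sketch: constraining the allocation to a bounded key space, e.g.
 * when low page offsets are reserved for fixed legacy mappings. The
 * FOO_FIRST_DYN_PGOFF constant, uctx and fentry are hypothetical.
 *
 *	err = rdma_user_mmap_entry_insert_range(&uctx->ibuc,
 *						&fentry->rdma_entry, length,
 *						FOO_FIRST_DYN_PGOFF,
 *						U32_MAX);
 */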

/**
 * rdma_user_mmap_entry_insert() - Insert an entry to the mmap_xa.
 *
 * @ucontext: associated user context.
 * @entry: the entry to insert into the mmap_xa
 * @length: length of the address that will be mmapped
 *
 * This function should be called by drivers that use the rdma_user_mmap
 * interface for handling user mmapped addresses. The database is handled in
 * the core and helper functions are provided to insert entries into the
 * database and extract entries when the user calls mmap with the given offset.
 * The function allocates a unique page offset that should be provided to the
 * user; the user will use the offset to retrieve information such as the
 * address to be mapped and how.
 *
 * Return: 0 on success and -ENOMEM on failure
 */
int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
				struct rdma_user_mmap_entry *entry,
				size_t length)
{
	return rdma_user_mmap_entry_insert_range(ucontext, entry, length, 0,
						 U32_MAX);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_insert);
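
/*
 * Usage sketch: allocating a driver entry, inserting it and handing the
 * resulting byte offset back to userspace for a later mmap(2) call.
 * foo_mmap_entry and the foo_resp structure are hypothetical;
 * rdma_user_mmap_get_offset() converts start_pgoff to a byte offset.
 *
 *	static int foo_alloc_db(struct ib_ucontext *ibuctx,
 *				struct foo_resp *resp)
 *	{
 *		struct foo_mmap_entry *fentry;
 *		int err;
 *
 *		fentry = kzalloc(sizeof(*fentry), GFP_KERNEL);
 *		if (!fentry)
 *			return -ENOMEM;
 *
 *		err = rdma_user_mmap_entry_insert(ibuctx,
 *						  &fentry->rdma_entry,
 *						  PAGE_SIZE);
 *		if (err) {
 *			kfree(fentry);
 *			return err;
 *		}
 *
 *		// Byte offset userspace passes to mmap(2)
 *		resp->db_offset =
 *			rdma_user_mmap_get_offset(&fentry->rdma_entry);
 *		return 0;
 *	}
 */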