// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2023 - Cornelis Networks, Inc.
 */

#include <linux/types.h>

#include "hfi.h"
#include "common.h"
#include "device.h"
#include "pinning.h"
#include "mmu_rb.h"
#include "user_sdma.h"
#include "trace.h"

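/*
 * A cached range of pinned system (user) pages. The embedded mmu_rb_node
 * carries the virtual address range and kref and lives in the per-pq MMU
 * rb tree; pages[] holds the npages pinned pages backing that range.
 */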
struct sdma_mmu_node {
	struct mmu_rb_node rb;
	struct hfi1_user_sdma_pkt_q *pq;
	struct page **pages;
	unsigned int npages;
};

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, void *arg2,
			 bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);

static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.evict = sdma_rb_evict,
	.remove = sdma_rb_remove,
};

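/*
 * Register the MMU rb-tree handler that backs this packet queue's cache of
 * pinned system memory.
 */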
int hfi1_init_system_pinning(struct hfi1_user_sdma_pkt_q *pq)
{
	struct hfi1_devdata *dd = pq->dd;
	int ret;

	ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
	if (ret)
		dd_dev_err(dd,
			   "[%u:%u] Failed to register system memory DMA support with MMU: %d\n",
			   pq->ctxt, pq->subctxt, ret);
	return ret;
}

void hfi1_free_system_pinning(struct hfi1_user_sdma_pkt_q *pq)
{
	if (pq->handler)
		hfi1_mmu_rb_unregister(pq->handler);
}

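/*
 * Evict cached entries until at least npages pages have been released, if
 * possible. Returns the number of pages actually cleared.
 */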
static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	struct evict_data evict_data;

	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(pq->handler, &evict_data);
	return evict_data.cleared;
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned int start, unsigned int npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, false);
	kfree(pages);
}

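/*
 * The mm is not stored in the node itself; it is recovered from the MMU
 * notifier registration held by the node's rb handler.
 */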
static inline struct mm_struct *mm_from_sdma_node(struct sdma_mmu_node *node)
{
	return node->rb.handler->mn.mm;
}

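/* Unpin and free a cache node, updating the pq's locked-page accounting. */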
static void free_system_node(struct sdma_mmu_node *node)
{
	if (node->npages) {
		unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
				   node->npages);
		atomic_sub(node->npages, &node->pq->n_locked);
	}
	kfree(node);
}

/*
 * kref_get()'s an additional kref on the returned rb_node to prevent rb_node
 * from being released until after rb_node is assigned to an SDMA descriptor
 * (struct sdma_desc) under add_system_iovec_to_sdma_packet(), even if the
 * virtual address range for rb_node is invalidated between now and then.
 */
static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler,
					      unsigned long start,
					      unsigned long end)
{
	struct mmu_rb_node *rb_node;
	unsigned long flags;

	spin_lock_irqsave(&handler->lock, flags);
	rb_node = hfi1_mmu_rb_get_first(handler, start, (end - start));
	if (!rb_node) {
		spin_unlock_irqrestore(&handler->lock, flags);
		return NULL;
	}

	/* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
	kref_get(&rb_node->refcount);
	spin_unlock_irqrestore(&handler->lock, flags);

	return container_of(rb_node, struct sdma_mmu_node, rb);
}

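/*
 * Pin npages of user memory starting at start_address and record them in
 * node. If the locked-page limit would be exceeded, evict cached entries
 * before pinning.
 */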
static int pin_system_pages(struct user_sdma_request *req,
			    uintptr_t start_address, size_t length,
			    struct sdma_mmu_node *node, int npages)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	int pinned, cleared;
	struct page **pages;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

retry:
	if (!hfi1_can_pin_pages(pq->dd, current->mm, atomic_read(&pq->n_locked),
				npages)) {
		SDMA_DBG(req, "Evicting: nlocked %u npages %u",
			 atomic_read(&pq->n_locked), npages);
		cleared = sdma_cache_evict(pq, npages);
		if (cleared >= npages)
			goto retry;
	}

	SDMA_DBG(req, "Acquire user pages start_address %lx node->npages %u npages %u",
		 start_address, node->npages, npages);
	pinned = hfi1_acquire_user_pages(current->mm, start_address, npages, 0,
					 pages);

	if (pinned < 0) {
		kfree(pages);
		SDMA_DBG(req, "pinned %d", pinned);
		return pinned;
	}
	if (pinned != npages) {
		unpin_vector_pages(current->mm, pages, node->npages, pinned);
		SDMA_DBG(req, "npages %u pinned %d", npages, pinned);
		return -EFAULT;
	}
	node->rb.addr = start_address;
	node->rb.len = length;
	node->pages = pages;
	node->npages = npages;
	atomic_add(pinned, &pq->n_locked);
	SDMA_DBG(req, "done. pinned %d", pinned);
	return 0;
}

/*
 * kref refcount on *node_p will be 2 on successful addition: one kref from
 * kref_init() for mmu_rb_handler and one kref to prevent *node_p from being
 * released until after *node_p is assigned to an SDMA descriptor (struct
 * sdma_desc) under add_system_iovec_to_sdma_packet(), even if the virtual
 * address range for *node_p is invalidated between now and then.
 */
static int add_system_pinning(struct user_sdma_request *req,
			      struct sdma_mmu_node **node_p,
			      unsigned long start, unsigned long len)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node;
	int ret;

	node = kzalloc(sizeof(*node), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	/* First kref "moves" to mmu_rb_handler */
	kref_init(&node->rb.refcount);

	/* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
	kref_get(&node->rb.refcount);

	node->pq = pq;
	ret = pin_system_pages(req, start, len, node, PFN_DOWN(len));
	if (ret == 0) {
		ret = hfi1_mmu_rb_insert(pq->handler, &node->rb);
		if (ret)
			free_system_node(node);
		else
			*node_p = node;

		return ret;
	}

	kfree(node);
	return ret;
}

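/*
 * Return a cache entry that covers the start of the page-aligned range
 * [req_start, req_start + req_len). If an existing entry begins above the
 * requested start, prepend a new pinning for the gap; any remainder past the
 * returned entry is handled by a subsequent call.
 */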
static int get_system_cache_entry(struct user_sdma_request *req,
				  struct sdma_mmu_node **node_p,
				  size_t req_start, size_t req_len)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	u64 start = ALIGN_DOWN(req_start, PAGE_SIZE);
	u64 end = PFN_ALIGN(req_start + req_len);
	int ret;

	if ((end - start) == 0) {
		SDMA_DBG(req,
			 "Request for empty cache entry req_start %lx req_len %lx start %llx end %llx",
			 req_start, req_len, start, end);
		return -EINVAL;
	}

	SDMA_DBG(req, "req_start %lx req_len %lu", req_start, req_len);

	while (1) {
		struct sdma_mmu_node *node =
			find_system_node(pq->handler, start, end);
		u64 prepend_len = 0;

		SDMA_DBG(req, "node %p start %llx end %llu", node, start, end);
		if (!node) {
			ret = add_system_pinning(req, node_p, start,
						 end - start);
			if (ret == -EEXIST) {
				/*
				 * Another execution context has inserted a
				 * conflicting entry first.
				 */
				continue;
			}
			return ret;
		}

		if (node->rb.addr <= start) {
			/*
			 * This entry covers at least part of the region. If it
			 * doesn't extend to the end, then this will be called
			 * again for the next segment.
			 */
			*node_p = node;
			return 0;
		}

		SDMA_DBG(req, "prepend: node->rb.addr %lx, node->rb.refcount %d",
			 node->rb.addr, kref_read(&node->rb.refcount));
		prepend_len = node->rb.addr - start;

		/*
		 * This node will not be returned, instead a new node
		 * will be. So release the reference.
		 */
		kref_put(&node->rb.refcount, hfi1_mmu_rb_release);

		/* Prepend a node to cover the beginning of the allocation */
		ret = add_system_pinning(req, node_p, start, prepend_len);
		if (ret == -EEXIST) {
			/* Another execution context has inserted a conflicting entry first. */
			continue;
		}
		return ret;
	}
}

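/*
 * kref get/put wrappers used as the pinning-context callbacks handed to
 * sdma_txadd_page() for descriptors that reference a cache entry.
 */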
static void sdma_mmu_rb_node_get(void *ctx)
{
	struct mmu_rb_node *node = ctx;

	kref_get(&node->refcount);
}

static void sdma_mmu_rb_node_put(void *ctx)
{
	struct sdma_mmu_node *node = ctx;

	kref_put(&node->rb.refcount, hfi1_mmu_rb_release);
}

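/*
 * Add descriptors to the txreq for from_this_cache_entry bytes of cache_entry,
 * one page at a time, starting at virtual address start.
 */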
static int add_mapping_to_sdma_packet(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx,
				      struct sdma_mmu_node *cache_entry,
				      size_t start,
				      size_t from_this_cache_entry)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	unsigned int page_offset;
	unsigned int from_this_page;
	size_t page_index;
	void *ctx;
	int ret;

	/*
	 * Because the cache may be more fragmented than the memory that is
	 * being accessed, it's not strictly necessary to have a descriptor per
	 * cache entry.
	 */

	while (from_this_cache_entry) {
		page_index = PFN_DOWN(start - cache_entry->rb.addr);

		if (page_index >= cache_entry->npages) {
			SDMA_DBG(req,
				 "Request for page_index %zu >= cache_entry->npages %u",
				 page_index, cache_entry->npages);
			return -EINVAL;
		}

		page_offset = start - ALIGN_DOWN(start, PAGE_SIZE);
		from_this_page = PAGE_SIZE - page_offset;

		if (from_this_page < from_this_cache_entry) {
			ctx = NULL;
		} else {
			/*
			 * In the case they are equal the next line has no
			 * practical effect, but it's better to do a register
			 * to register copy than a conditional branch.
			 */
			from_this_page = from_this_cache_entry;
			ctx = cache_entry;
		}

		ret = sdma_txadd_page(pq->dd, &tx->txreq,
				      cache_entry->pages[page_index],
				      page_offset, from_this_page,
				      ctx,
				      sdma_mmu_rb_node_get,
				      sdma_mmu_rb_node_put);
		if (ret) {
			/*
			 * When there's a failure, the entire request is freed
			 * by user_sdma_send_pkts().
			 */
			SDMA_DBG(req,
				 "sdma_txadd_page failed %d page_index %lu page_offset %u from_this_page %u",
				 ret, page_index, page_offset, from_this_page);
			return ret;
		}
		start += from_this_page;
		from_this_cache_entry -= from_this_page;
	}
	return 0;
}

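/*
 * Add from_this_iovec bytes, starting at the iovec's current offset, to the
 * txreq, finding or creating a cache entry for each covering range.
 */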
static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req,
					   struct user_sdma_txreq *tx,
					   struct user_sdma_iovec *iovec,
					   size_t from_this_iovec)
{
	while (from_this_iovec > 0) {
		struct sdma_mmu_node *cache_entry;
		size_t from_this_cache_entry;
		size_t start;
		int ret;

		start = (uintptr_t)iovec->iov.iov_base + iovec->offset;
		ret = get_system_cache_entry(req, &cache_entry, start,
					     from_this_iovec);
		if (ret) {
			SDMA_DBG(req, "pin system segment failed %d", ret);
			return ret;
		}

		from_this_cache_entry = cache_entry->rb.len - (start - cache_entry->rb.addr);
		if (from_this_cache_entry > from_this_iovec)
			from_this_cache_entry = from_this_iovec;

		ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start,
						 from_this_cache_entry);

		/*
		 * Done adding cache_entry to zero or more sdma_desc. Can
		 * kref_put() the "safety" kref taken under
		 * get_system_cache_entry().
		 */
		kref_put(&cache_entry->rb.refcount, hfi1_mmu_rb_release);

		if (ret) {
			SDMA_DBG(req, "add system segment failed %d", ret);
			return ret;
		}

		iovec->offset += from_this_cache_entry;
		from_this_iovec -= from_this_cache_entry;
	}

	return 0;
}

/*
 * Add up to pkt_data_remaining bytes to the txreq, starting at the current
 * offset in the given iovec entry and continuing until all data has been added
 * to the txreq or the iovec entry type changes.
 *
 * On success, prior to returning, adjust pkt_data_remaining, req->iov_idx, and
 * the offset value in req->iov[req->iov_idx] to reflect the data that has been
 * consumed.
 */
int hfi1_add_pages_to_sdma_packet(struct user_sdma_request *req,
				  struct user_sdma_txreq *tx,
				  struct user_sdma_iovec *iovec,
				  u32 *pkt_data_remaining)
{
	size_t remaining_to_add = *pkt_data_remaining;
	/*
	 * Walk through iovec entries, ensure the associated pages
	 * are pinned and mapped, add data to the packet until no more
	 * data remains to be added or the iovec entry type changes.
	 */
	while (remaining_to_add > 0) {
		struct user_sdma_iovec *cur_iovec;
		size_t from_this_iovec;
		int ret;

		cur_iovec = iovec;
		from_this_iovec = iovec->iov.iov_len - iovec->offset;

		if (from_this_iovec > remaining_to_add) {
			from_this_iovec = remaining_to_add;
		} else {
			/* The current iovec entry will be consumed by this pass. */
			req->iov_idx++;
			iovec++;
		}

		ret = add_system_iovec_to_sdma_packet(req, tx, cur_iovec,
						      from_this_iovec);
		if (ret)
			return ret;

		remaining_to_add -= from_this_iovec;
	}
	*pkt_data_remaining = remaining_to_add;

	return 0;
}

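/* Filter op: a cached node matches only on an exact start-address match. */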
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	return (bool)(node->addr == addr);
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

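/*
 * Remove op: called by the mmu_rb handler once a node has been taken out of
 * the tree; unpins its pages and frees it.
 */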
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	free_system_node(node);
}