// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright(c) 2020 Cornelis Networks, Inc.
 * Copyright(c) 2015-2018 Intel Corporation.
 */
#include <asm/page.h>
#include <linux/string.h>

#include "mmu_rb.h"
#include "user_exp_rcv.h"
#include "trace.h"

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq);
static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
			    struct tid_group *grp, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
static void __clear_tid_node(struct hfi1_filedata *fd,
			     struct tid_rb_node *node);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

static const struct mmu_interval_notifier_ops tid_mn_ops = {
	.invalidate = tid_rb_invalidate,
};
static const struct mmu_interval_notifier_ops tid_cover_ops = {
	.invalidate = tid_cover_invalidate,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
			   struct hfi1_ctxtdata *uctxt)
{
	int ret = 0;

	fd->entry_to_rb = kcalloc(uctxt->expected_count,
				  sizeof(struct rb_node *),
				  GFP_KERNEL);
	if (!fd->entry_to_rb)
		return -ENOMEM;

	if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kcalloc(uctxt->expected_count,
					   sizeof(*fd->invalid_tids),
					   GFP_KERNEL);
		if (!fd->invalid_tids) {
			kfree(fd->entry_to_rb);
			fd->entry_to_rb = NULL;
			return -ENOMEM;
		}
		fd->use_mn = true;
	}

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
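	 *
	 * For example (illustrative numbers only): with expected_count =
	 * 2048 and subctxt_cnt = 3, the quotient is 682 with a remainder
	 * of 2, so subctxts 0 and 1 each get 683 entries and subctxt 2
	 * gets 682, accounting for all 2048 entries.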
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
	spin_lock(&fd->tid_lock);
	if (uctxt->subctxt_cnt && fd->use_mn) {
		u16 remainder;

		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
	spin_unlock(&fd->tid_lock);

	return ret;
}

void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	mutex_lock(&uctxt->exp_mutex);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
	mutex_unlock(&uctxt->exp_mutex);

	kfree(fd->invalid_tids);
	fd->invalid_tids = NULL;

	kfree(fd->entry_to_rb);
	fd->entry_to_rb = NULL;
}

/*
 * Release pinned receive buffer pages.
 *
 * @mapped: true if the pages have been DMA mapped. false otherwise.
 * @idx: Index of the first page to unpin.
 * @npages: Number of pages to unpin.
 *
 * If the pages have been DMA mapped (indicated by the mapped parameter),
 * their info will be passed via a struct tid_rb_node. If they haven't
 * been mapped, their info will be passed via a struct tid_user_buf.
 */
static void unpin_rcv_pages(struct hfi1_filedata *fd,
			    struct tid_user_buf *tidbuf,
			    struct tid_rb_node *node,
			    unsigned int idx,
			    unsigned int npages,
			    bool mapped)
{
	struct page **pages;
	struct hfi1_devdata *dd = fd->uctxt->dd;
	struct mm_struct *mm;

	if (mapped) {
		dma_unmap_single(&dd->pcidev->dev, node->dma_addr,
				 node->npages * PAGE_SIZE, DMA_FROM_DEVICE);
		pages = &node->pages[idx];
		mm = mm_from_tid_node(node);
	} else {
		pages = &tidbuf->pages[idx];
		mm = current->mm;
	}
	hfi1_release_user_pages(mm, pages, npages, mapped);
	fd->tid_n_pinned -= npages;
}

/*
 * Pin receive buffer pages.
 */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
	int pinned;
	unsigned int npages = tidbuf->npages;
	unsigned long vaddr = tidbuf->vaddr;
	struct page **pages = NULL;
	struct hfi1_devdata *dd = fd->uctxt->dd;

	if (npages > fd->uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
		kfree(pages);
		return -ENOMEM;
	}

	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
	if (pinned <= 0) {
		kfree(pages);
		return pinned;
	}
	tidbuf->pages = pages;
	fd->tid_n_pinned += pinned;
	return pinned;
}

/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *             of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules:
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as much as
 *          can fit into the group. If the group becomes fully
 *          used, move it to tid_full_list.
 */
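/*
 * A worked example with illustrative numbers: a buffer yielding 21
 * pagesets with a group size of 8 gives ngroups = 21 / 8 = 2 complete
 * groups. Rule 1 programs those 2 groups (16 pagesets) and moves them
 * to tid_full_list; the remaining 5 pagesets are placed into partially
 * used groups by rule 2.
 */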
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned int ngroups, pageset_count,
		tididx = 0, mapped, mapped_pages = 0;
	u32 *tidlist = NULL;
	struct tid_user_buf *tidbuf;
	unsigned long mmu_seq = 0;

	if (!PAGE_ALIGNED(tinfo->vaddr))
		return -EINVAL;
	if (tinfo->length == 0)
		return -EINVAL;

	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
	if (!tidbuf)
		return -ENOMEM;

	mutex_init(&tidbuf->cover_mutex);
	tidbuf->vaddr = tinfo->vaddr;
	tidbuf->length = tinfo->length;
	tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length);
	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
				GFP_KERNEL);
	if (!tidbuf->psets) {
		ret = -ENOMEM;
		goto fail_release_mem;
	}

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&tidbuf->notifier, current->mm,
			tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
			&tid_cover_ops);
		if (ret)
			goto fail_release_mem;
		mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
	}

	pinned = pin_rcv_pages(fd, tidbuf);
	if (pinned <= 0) {
		ret = (pinned < 0) ? pinned : -ENOSPC;
		goto fail_unpin;
	}

	/* Find sets of physically contiguous pages */
	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

	/* Reserve the number of expected tids to be used. */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = tidbuf->n_psets;
	fd->tid_used += pageset_count;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count) {
		ret = -ENOSPC;
		goto fail_unreserve;
	}

	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto fail_unreserve;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_mutex);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		ret = program_rcvarray(fd, tidbuf, grp,
				       dd->rcv_entries.group_size,
				       tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		mapped_pages += mapped;
	}

	while (tididx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - tididx,
					     grp->size - grp->used);

			ret = program_rcvarray(fd, tidbuf, grp,
					       use, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (tididx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_mutex);
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);

	/* fail if nothing was programmed, set error if none provided */
	if (tididx == 0) {
		if (ret >= 0)
			ret = -ENOSPC;
		goto fail_unreserve;
	}

	/* adjust reserved tid_used to actual count */
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count - tididx;
	spin_unlock(&fd->tid_lock);

	/* unpin all pages not covered by a TID */
	unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
			false);

	if (fd->use_mn) {
		/* check for an invalidate during setup */
		bool fail = false;

		mutex_lock(&tidbuf->cover_mutex);
		fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
		mutex_unlock(&tidbuf->cover_mutex);

		if (fail) {
			ret = -EBUSY;
			goto fail_unprogram;
		}
	}

	tinfo->tidcnt = tididx;
	tinfo->length = mapped_pages * PAGE_SIZE;

	if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
			 tidlist, sizeof(tidlist[0]) * tididx)) {
		ret = -EFAULT;
		goto fail_unprogram;
	}

	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return 0;

fail_unprogram:
	/* unprogram, unmap, and unpin all allocated TIDs */
	tinfo->tidlist = (unsigned long)tidlist;
	hfi1_user_exp_rcv_clear(fd, tinfo);
	tinfo->tidlist = 0;
	pinned = 0;		/* nothing left to unpin */
	pageset_count = 0;	/* nothing left reserved */
fail_unreserve:
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count;
	spin_unlock(&fd->tid_lock);
fail_unpin:
	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	if (pinned > 0)
		unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
fail_release_mem:
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return ret;
}

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	u32 *tidinfo;
	unsigned tididx;

	if (unlikely(tinfo->tidcnt > fd->tid_used))
		return -EINVAL;

	tidinfo = memdup_array_user(u64_to_user_ptr(tinfo->tidlist),
				    tinfo->tidcnt, sizeof(tidinfo[0]));
	if (IS_ERR(tidinfo))
		return PTR_ERR(tidinfo);

	mutex_lock(&uctxt->exp_mutex);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fd, tidinfo[tididx]);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
			break;
		}
	}
	spin_lock(&fd->tid_lock);
	fd->tid_used -= tididx;
	spin_unlock(&fd->tid_lock);
	tinfo->tidcnt = tididx;
	mutex_unlock(&uctxt->exp_mutex);

	kfree(tidinfo);
	return ret;
}

int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
			      struct hfi1_tid_info *tinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	unsigned long *ev = uctxt->dd->events +
		(uctxt_offset(uctxt) + fd->subctxt);
	u32 *array;
	int ret = 0;

	/*
	 * copy_to_user() can sleep, which will leave the invalid_lock
	 * locked and cause the MMU notifier to be blocked on the lock
	 * for a long time.
	 * Copy the data to a local buffer so we can release the lock.
	 */
	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
	if (!array)
		return -EFAULT;

	spin_lock(&fd->invalid_lock);
	if (fd->invalid_tid_idx) {
		memcpy(array, fd->invalid_tids, sizeof(*array) *
		       fd->invalid_tid_idx);
		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
		       fd->invalid_tid_idx);
		tinfo->tidcnt = fd->invalid_tid_idx;
		fd->invalid_tid_idx = 0;
		/*
		 * Reset the user flag while still holding the lock.
		 * Otherwise, PSM can miss events.
		 */
		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
	} else {
		tinfo->tidcnt = 0;
	}
	spin_unlock(&fd->invalid_lock);

	if (tinfo->tidcnt) {
		if (copy_to_user((void __user *)tinfo->tidlist,
				 array, sizeof(*array) * tinfo->tidcnt))
			ret = -EFAULT;
	}
	kfree(array);

	return ret;
}

static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;
	struct page **pages = tidbuf->pages;
	struct tid_pageset *list = tidbuf->psets;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
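	/*
	 * Note: the loop below runs one iteration past the last page. The
	 * sentinel pfn of 0 on that final pass can never match ++pfn, so
	 * the last contiguous run is always flushed into the pageset list.
	 */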
	pfn = page_to_pfn(pages[0]);
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the pfn's are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down
			 * into sizes supported by the HW.
			 * There are two main constraints:
			 *   1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *      If the total set size is bigger than that
			 *      program only a MAX_EXPECTED_BUFFER chunk.
			 *   2. The buffer size has to be a power of two. If
			 *      it is not, round down to the closest power of
			 *      2 and program that size.
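			 *
			 * For example, assuming each chunk fits under
			 * MAX_EXPECTED_BUFFER, a contiguous run of 13
			 * pages would be split into power-of-two sets
			 * of 8, 4, and 1 pages.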
			 */
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}

/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fd: filedata pointer
 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 *	  virtual address, buffer length, page pointers, pagesets (array of
 *	  struct tid_pageset holding information on physically contiguous
 *	  chunks from the user buffer), and other fields.
 * @grp: RcvArray group
 * @count: number of struct tid_pageset's to program
 * @tidlist: the array of u32 elements where the information about the
 *	     programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *	     entries.
 *
 * This function will program up to 'count' number of RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
 */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
			    struct tid_group *grp, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	unsigned int start = *tididx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/* Find the first unused entry in the group */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = tbuf->psets[setidx].count;
		pageidx = tbuf->psets[setidx].idx;

		ret = set_rcvarray_entry(fd, tbuf,
					 rcventry, grp, pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

		tidinfo = create_tid(rcventry - uctxt->expected_base, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	return idx;
}

static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages)
{
	int ret;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	dma_addr_t phys;
	struct page **pages = tbuf->pages + pageidx;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc(struct_size(node, pages, npages), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, DMA_FROM_DEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->fdata = fd;
	mutex_init(&node->invalidate_mutex);
	node->phys = page_to_phys(pages[0]);
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, flex_array_size(node, pages, npages));

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&node->notifier, current->mm,
			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
			&tid_mn_ops);
		if (ret)
			goto out_unmap;
	}
	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
			       node->notifier.interval_tree.start, node->phys,
			       phys);
	return 0;

out_unmap:
	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
		  node->rcventry, node->notifier.interval_tree.start,
		  node->phys, ret);
	dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE,
			 DMA_FROM_DEVICE);
	kfree(node);
	return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct tid_rb_node *node;
	u32 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

	if (tidctrl == 0x3 || tidctrl == 0x0)
		return -EINVAL;

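	/*
	 * tidinfo encodes a pair index (IDX) and a control value (CTRL)
	 * selecting one entry of the pair: 1 for the even entry, 2 for
	 * the odd one (0 and 3 are rejected above). The offset within
	 * the expected RcvArray region is therefore 2 * IDX + (CTRL - 1).
	 */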
	rcventry = tididx + (tidctrl - 1);

	if (rcventry >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   rcventry, uctxt->ctxt);
		return -EINVAL;
	}

	node = fd->entry_to_rb[rcventry];
	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
		return -EBADF;

	if (fd->use_mn)
		mmu_interval_notifier_remove(&node->notifier);
	cacheless_tid_rb_remove(fd, node);

	return 0;
}

static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	mutex_lock(&node->invalidate_mutex);
	if (node->freed)
		goto done;
	node->freed = true;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages,
				 node->notifier.interval_tree.start, node->phys,
				 node->dma_addr);

	/* Make sure device has seen the write before pages are unpinned */
	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
done:
	mutex_unlock(&node->invalidate_mutex);
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	__clear_tid_node(fd, node);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

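	/*
	 * A group that was full now has exactly one free entry (used ==
	 * size - 1), so move it back to the partially used list; a group
	 * with no entries in use goes back to the free list.
	 */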
	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}

/*
 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 * clearing nodes in the non-cached case.
 */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd)
{
	struct tid_group *grp, *ptr;
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct tid_rb_node *node;

				node = fd->entry_to_rb[rcventry -
						       uctxt->expected_base];
				if (!node || node->rcventry != rcventry)
					continue;

				if (fd->use_mn)
					mmu_interval_notifier_remove(
						&node->notifier);
				cacheless_tid_rb_remove(fd, node);
			}
		}
	}
}

static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct tid_rb_node *node =
		container_of(mni, struct tid_rb_node, notifier);
	struct hfi1_filedata *fdata = node->fdata;
	struct hfi1_ctxtdata *uctxt = fdata->uctxt;

	if (node->freed)
		return true;

	/* take action only if unmapping */
	if (range->event != MMU_NOTIFY_UNMAP)
		return true;

	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
				 node->notifier.interval_tree.start,
				 node->rcventry, node->npages, node->dma_addr);

	/* clear the hardware rcvarray entry */
	__clear_tid_node(fdata, node);

	spin_lock(&fdata->invalid_lock);
	if (fdata->invalid_tid_idx < uctxt->expected_count) {
		fdata->invalid_tids[fdata->invalid_tid_idx] =
			create_tid(node->rcventry - uctxt->expected_base,
				   node->npages);
		if (!fdata->invalid_tid_idx) {
			unsigned long *ev;

			/*
			 * hfi1_set_uevent_bits() sets a user event flag
			 * for all processes. Because calling into the
			 * driver to process TID cache invalidations is
			 * expensive and TID cache invalidations are
			 * handled on a per-process basis, we can
			 * optimize this to set the flag only for the
			 * process in question.
			 */
			ev = uctxt->dd->events +
				(uctxt_offset(uctxt) + fdata->subctxt);
			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
		}
		fdata->invalid_tid_idx++;
	}
	spin_unlock(&fdata->invalid_lock);
	return true;
}

static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq)
{
	struct tid_user_buf *tidbuf =
		container_of(mni, struct tid_user_buf, notifier);

	/* take action only if unmapping */
	if (range->event == MMU_NOTIFY_UNMAP) {
		mutex_lock(&tidbuf->cover_mutex);
		mmu_interval_set_seq(mni, cur_seq);
		mutex_unlock(&tidbuf->cover_mutex);
	}

	return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
	clear_tid_node(fdata, tnode);
}
967 | |