// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright(c) 2020 Cornelis Networks, Inc.
 * Copyright(c) 2015-2018 Intel Corporation.
 */
#include <asm/page.h>
#include <linux/string.h>

#include "mmu_rb.h"
#include "user_exp_rcv.h"
#include "trace.h"

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq);
static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
			    struct tid_group *grp, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
static void __clear_tid_node(struct hfi1_filedata *fd,
			     struct tid_rb_node *node);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

static const struct mmu_interval_notifier_ops tid_mn_ops = {
	.invalidate = tid_rb_invalidate,
};
static const struct mmu_interval_notifier_ops tid_cover_ops = {
	.invalidate = tid_cover_invalidate,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
			   struct hfi1_ctxtdata *uctxt)
{
	int ret = 0;

	fd->entry_to_rb = kcalloc(uctxt->expected_count,
				  sizeof(struct rb_node *),
				  GFP_KERNEL);
	if (!fd->entry_to_rb)
		return -ENOMEM;

	if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kcalloc(uctxt->expected_count,
					   sizeof(*fd->invalid_tids),
					   GFP_KERNEL);
		if (!fd->invalid_tids) {
			kfree(fd->entry_to_rb);
			fd->entry_to_rb = NULL;
			return -ENOMEM;
		}
		fd->use_mn = true;
	}

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
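	 *
	 * For example (illustrative numbers only): with expected_count =
	 * 2048 and subctxt_cnt = 3, the quotient is 682 with a remainder
	 * of 2, so subctxts 0 and 1 each get 683 entries and subctxt 2
	 * gets 682, accounting for all 2048 entries.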
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
	spin_lock(&fd->tid_lock);
	if (uctxt->subctxt_cnt && fd->use_mn) {
		u16 remainder;

		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
	spin_unlock(&fd->tid_lock);

	return ret;
}

void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	mutex_lock(&uctxt->exp_mutex);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
	mutex_unlock(&uctxt->exp_mutex);

	kfree(fd->invalid_tids);
	fd->invalid_tids = NULL;

	kfree(fd->entry_to_rb);
	fd->entry_to_rb = NULL;
}

/*
 * Release pinned receive buffer pages.
 *
 * @mapped: true if the pages have been DMA mapped. false otherwise.
 * @idx: Index of the first page to unpin.
 * @npages: Number of pages to unpin.
 *
 * If the pages have been DMA mapped (indicated by the mapped parameter),
 * their info will be passed via a struct tid_rb_node. If they haven't
 * been mapped, their info will be passed via a struct tid_user_buf.
 */
static void unpin_rcv_pages(struct hfi1_filedata *fd,
			    struct tid_user_buf *tidbuf,
			    struct tid_rb_node *node,
			    unsigned int idx,
			    unsigned int npages,
			    bool mapped)
{
	struct page **pages;
	struct hfi1_devdata *dd = fd->uctxt->dd;
	struct mm_struct *mm;

	if (mapped) {
		dma_unmap_single(&dd->pcidev->dev, node->dma_addr,
				 node->npages * PAGE_SIZE, DMA_FROM_DEVICE);
		pages = &node->pages[idx];
		mm = mm_from_tid_node(node);
	} else {
		pages = &tidbuf->pages[idx];
		mm = current->mm;
	}
	hfi1_release_user_pages(mm, pages, npages, mapped);
	fd->tid_n_pinned -= npages;
}

/*
 * Pin receive buffer pages.
 */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
	int pinned;
	unsigned int npages = tidbuf->npages;
	unsigned long vaddr = tidbuf->vaddr;
	struct page **pages = NULL;
	struct hfi1_devdata *dd = fd->uctxt->dd;

	if (npages > fd->uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
		kfree(pages);
		return -ENOMEM;
	}

	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
	if (pinned <= 0) {
		kfree(pages);
		return pinned;
	}
	tidbuf->pages = pages;
	fd->tid_n_pinned += pinned;
	return pinned;
}

/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *             of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules:
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as much as
 *          can fit into the group. If the group becomes fully
 *          used, move it to tid_full_list.
 */
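/*
 * A worked example with illustrative numbers: a buffer yielding 21
 * pagesets with a group size of 8 gives ngroups = 21 / 8 = 2 complete
 * groups. Rule 1 programs those 2 groups (16 pagesets) and moves them
 * to tid_full_list; the remaining 5 pagesets are placed into partially
 * used groups by rule 2.
 */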
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned int ngroups, pageset_count,
		tididx = 0, mapped, mapped_pages = 0;
	u32 *tidlist = NULL;
	struct tid_user_buf *tidbuf;
	unsigned long mmu_seq = 0;

	if (!PAGE_ALIGNED(tinfo->vaddr))
		return -EINVAL;
	if (tinfo->length == 0)
		return -EINVAL;

	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
	if (!tidbuf)
		return -ENOMEM;

	mutex_init(&tidbuf->cover_mutex);
	tidbuf->vaddr = tinfo->vaddr;
	tidbuf->length = tinfo->length;
	tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length);
	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
				GFP_KERNEL);
	if (!tidbuf->psets) {
		ret = -ENOMEM;
		goto fail_release_mem;
	}

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&tidbuf->notifier, current->mm,
			tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
			&tid_cover_ops);
		if (ret)
			goto fail_release_mem;
		mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
	}

	pinned = pin_rcv_pages(fd, tidbuf);
	if (pinned <= 0) {
		ret = (pinned < 0) ? pinned : -ENOSPC;
		goto fail_unpin;
	}

	/* Find sets of physically contiguous pages */
	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

	/* Reserve the number of expected tids to be used. */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = tidbuf->n_psets;
	fd->tid_used += pageset_count;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count) {
		ret = -ENOSPC;
		goto fail_unreserve;
	}

	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto fail_unreserve;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_mutex);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		ret = program_rcvarray(fd, tidbuf, grp,
				       dd->rcv_entries.group_size,
				       tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		mapped_pages += mapped;
	}

	while (tididx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - tididx,
					     grp->size - grp->used);

			ret = program_rcvarray(fd, tidbuf, grp,
					       use, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (tididx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_mutex);
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);

	/* fail if nothing was programmed, set error if none provided */
	if (tididx == 0) {
		if (ret >= 0)
			ret = -ENOSPC;
		goto fail_unreserve;
	}

	/* adjust reserved tid_used to actual count */
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count - tididx;
	spin_unlock(&fd->tid_lock);

	/* unpin all pages not covered by a TID */
	unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
			false);

	if (fd->use_mn) {
		/* check for an invalidate during setup */
		bool fail = false;

		mutex_lock(&tidbuf->cover_mutex);
		fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
		mutex_unlock(&tidbuf->cover_mutex);

		if (fail) {
			ret = -EBUSY;
			goto fail_unprogram;
		}
	}

	tinfo->tidcnt = tididx;
	tinfo->length = mapped_pages * PAGE_SIZE;

	if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
			 tidlist, sizeof(tidlist[0]) * tididx)) {
		ret = -EFAULT;
		goto fail_unprogram;
	}

	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return 0;

fail_unprogram:
	/* unprogram, unmap, and unpin all allocated TIDs */
	tinfo->tidlist = (unsigned long)tidlist;
	hfi1_user_exp_rcv_clear(fd, tinfo);
	tinfo->tidlist = 0;
	pinned = 0;		/* nothing left to unpin */
	pageset_count = 0;	/* nothing left reserved */
fail_unreserve:
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count;
	spin_unlock(&fd->tid_lock);
fail_unpin:
	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	if (pinned > 0)
		unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
fail_release_mem:
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return ret;
}

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	u32 *tidinfo;
	unsigned tididx;

	if (unlikely(tinfo->tidcnt > fd->tid_used))
		return -EINVAL;

	tidinfo = memdup_array_user(u64_to_user_ptr(tinfo->tidlist),
				    tinfo->tidcnt, sizeof(tidinfo[0]));
	if (IS_ERR(tidinfo))
		return PTR_ERR(tidinfo);

	mutex_lock(&uctxt->exp_mutex);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fd, tidinfo[tididx]);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
			break;
		}
	}
	spin_lock(&fd->tid_lock);
	fd->tid_used -= tididx;
	spin_unlock(&fd->tid_lock);
	tinfo->tidcnt = tididx;
	mutex_unlock(&uctxt->exp_mutex);

	kfree(tidinfo);
	return ret;
}

int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
			      struct hfi1_tid_info *tinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	unsigned long *ev = uctxt->dd->events +
		(uctxt_offset(uctxt) + fd->subctxt);
	u32 *array;
	int ret = 0;

	/*
	 * copy_to_user() can sleep, which will leave the invalid_lock
	 * locked and cause the MMU notifier to be blocked on the lock
	 * for a long time.
	 * Copy the data to a local buffer so we can release the lock.
	 */
	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
	if (!array)
		return -EFAULT;

	spin_lock(&fd->invalid_lock);
	if (fd->invalid_tid_idx) {
		memcpy(array, fd->invalid_tids, sizeof(*array) *
		       fd->invalid_tid_idx);
		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
		       fd->invalid_tid_idx);
		tinfo->tidcnt = fd->invalid_tid_idx;
		fd->invalid_tid_idx = 0;
		/*
		 * Reset the user flag while still holding the lock.
		 * Otherwise, PSM can miss events.
		 */
		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
	} else {
		tinfo->tidcnt = 0;
	}
	spin_unlock(&fd->invalid_lock);

	if (tinfo->tidcnt) {
		if (copy_to_user((void __user *)tinfo->tidlist,
				 array, sizeof(*array) * tinfo->tidcnt))
			ret = -EFAULT;
	}
	kfree(array);

	return ret;
}

static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;
	struct page **pages = tidbuf->pages;
	struct tid_pageset *list = tidbuf->psets;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
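	/*
	 * Note: the loop below runs one iteration past the last page. The
	 * sentinel pfn of 0 on that final pass can never match ++pfn, so
	 * the last contiguous run is always flushed into the pageset list.
	 */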
	pfn = page_to_pfn(pages[0]);
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the pfn's are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down
			 * into sizes supported by the HW.
			 * There are two main constraints:
			 *   1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *      If the total set size is bigger than that
			 *      program only a MAX_EXPECTED_BUFFER chunk.
			 *   2. The buffer size has to be a power of two. If
			 *      it is not, round down to the closest power of
			 *      2 and program that size.
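			 *
			 * For example, assuming each chunk fits under
			 * MAX_EXPECTED_BUFFER, a contiguous run of 13
			 * pages would be split into power-of-two sets
			 * of 8, 4, and 1 pages.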
			 */
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}

/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fd: filedata pointer
 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 *	  virtual address, buffer length, page pointers, pagesets (array of
 *	  struct tid_pageset holding information on physically contiguous
 *	  chunks from the user buffer), and other fields.
 * @grp: RcvArray group
 * @count: number of struct tid_pageset's to program
 * @tidlist: the array of u32 elements where the information about the
 *	     programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *	     entries.
 *
 * This function will program up to 'count' number of RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
 */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
			    struct tid_group *grp, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	unsigned int start = *tididx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/* Find the first unused entry in the group */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = tbuf->psets[setidx].count;
		pageidx = tbuf->psets[setidx].idx;

		ret = set_rcvarray_entry(fd, tbuf,
					 rcventry, grp, pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

		tidinfo = create_tid(rcventry - uctxt->expected_base, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	return idx;
}

static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages)
{
	int ret;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	dma_addr_t phys;
	struct page **pages = tbuf->pages + pageidx;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc(struct_size(node, pages, npages), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, DMA_FROM_DEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->fdata = fd;
	mutex_init(&node->invalidate_mutex);
	node->phys = page_to_phys(pages[0]);
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, flex_array_size(node, pages, npages));

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&node->notifier, current->mm,
			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
			&tid_mn_ops);
		if (ret)
			goto out_unmap;
	}
	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
			       node->notifier.interval_tree.start, node->phys,
			       phys);
	return 0;

out_unmap:
	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
		  node->rcventry, node->notifier.interval_tree.start,
		  node->phys, ret);
	dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE,
			 DMA_FROM_DEVICE);
	kfree(node);
	return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct tid_rb_node *node;
	u32 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

	if (tidctrl == 0x3 || tidctrl == 0x0)
		return -EINVAL;

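	/*
	 * tidinfo encodes a pair index (IDX) and a control value (CTRL)
	 * selecting one entry of the pair: 1 for the even entry, 2 for
	 * the odd one (0 and 3 are rejected above). The offset within
	 * the expected RcvArray region is therefore 2 * IDX + (CTRL - 1).
	 */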
	rcventry = tididx + (tidctrl - 1);

	if (rcventry >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   rcventry, uctxt->ctxt);
		return -EINVAL;
	}

	node = fd->entry_to_rb[rcventry];
	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
		return -EBADF;

	if (fd->use_mn)
		mmu_interval_notifier_remove(&node->notifier);
	cacheless_tid_rb_remove(fd, node);

	return 0;
}

static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	mutex_lock(&node->invalidate_mutex);
	if (node->freed)
		goto done;
	node->freed = true;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages,
				 node->notifier.interval_tree.start, node->phys,
				 node->dma_addr);

	/* Make sure device has seen the write before pages are unpinned */
	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
done:
	mutex_unlock(&node->invalidate_mutex);
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	__clear_tid_node(fd, node);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

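	/*
	 * A group that was full now has exactly one free entry (used ==
	 * size - 1), so move it back to the partially used list; a group
	 * with no entries in use goes back to the free list.
	 */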
	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}

/*
 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 * clearing nodes in the non-cached case.
 */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd)
{
	struct tid_group *grp, *ptr;
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct tid_rb_node *node;

				node = fd->entry_to_rb[rcventry -
						       uctxt->expected_base];
				if (!node || node->rcventry != rcventry)
					continue;

				if (fd->use_mn)
					mmu_interval_notifier_remove(
						&node->notifier);
				cacheless_tid_rb_remove(fd, node);
			}
		}
	}
}

static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct tid_rb_node *node =
		container_of(mni, struct tid_rb_node, notifier);
	struct hfi1_filedata *fdata = node->fdata;
	struct hfi1_ctxtdata *uctxt = fdata->uctxt;

	if (node->freed)
		return true;

	/* take action only if unmapping */
	if (range->event != MMU_NOTIFY_UNMAP)
		return true;

	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
				 node->notifier.interval_tree.start,
				 node->rcventry, node->npages, node->dma_addr);

	/* clear the hardware rcvarray entry */
	__clear_tid_node(fdata, node);

	spin_lock(&fdata->invalid_lock);
	if (fdata->invalid_tid_idx < uctxt->expected_count) {
		fdata->invalid_tids[fdata->invalid_tid_idx] =
			create_tid(node->rcventry - uctxt->expected_base,
				   node->npages);
		if (!fdata->invalid_tid_idx) {
			unsigned long *ev;

			/*
			 * hfi1_set_uevent_bits() sets a user event flag
			 * for all processes. Because calling into the
			 * driver to process TID cache invalidations is
			 * expensive and TID cache invalidations are
			 * handled on a per-process basis, we can
			 * optimize this to set the flag only for the
			 * process in question.
			 */
			ev = uctxt->dd->events +
				(uctxt_offset(uctxt) + fdata->subctxt);
			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
		}
		fdata->invalid_tid_idx++;
	}
	spin_unlock(&fdata->invalid_lock);
	return true;
}

static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq)
{
	struct tid_user_buf *tidbuf =
		container_of(mni, struct tid_user_buf, notifier);

	/* take action only if unmapping */
	if (range->event == MMU_NOTIFY_UNMAP) {
		mutex_lock(&tidbuf->cover_mutex);
		mmu_interval_set_seq(mni, cur_seq);
		mutex_unlock(&tidbuf->cover_mutex);
	}

	return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
	clear_tid_node(fdata, tnode);
}
967 | |