tid_rdma.c source code [linux/drivers/infiniband/hw/hfi1/tid_rdma.c]

1	// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
2	/*
3	* Copyright(c) 2018 - 2020 Intel Corporation.
4	*
5	*/
6
7	#include "hfi.h"
8	#include "qp.h"
9	#include "rc.h"
10	#include "verbs.h"
11	#include "tid_rdma.h"
12	#include "exp_rcv.h"
13	#include "trace.h"
14
15	/**
16	* DOC: TID RDMA READ protocol
17	*
18	* This is an end-to-end protocol at the hfi1 level between two nodes that
19	* improves performance by avoiding data copy on the requester side. It
20	* converts a qualified RDMA READ request into a TID RDMA READ request on
21	* the requester side and thereafter handles the request and response
22	* differently. To be qualified, the RDMA READ request should meet the
23	* following:
24	* -- The total data length should be greater than 256K;
25	* -- The total data length should be a multiple of 4K page size;
26	* -- Each local scatter-gather entry should be 4K page aligned;
27	* -- Each local scatter-gather entry should be a multiple of 4K page size;
28	*/
29
/*
 * Bit masks that kern_set_hw_flow() ORs into the value it writes to a
 * RCV_TID_FLOW_TABLE entry.
 */
#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)

/* Maximum number of packets within a flow generation. */
#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)

/* Bits of a u32 that hold a flow generation; applied by mask_generation(). */
#define GENERATION_MASK 0xFFFFF
41
42	static u32 mask_generation(u32 a)
43	{
44	return a & GENERATION_MASK;
45	}
46
/* Reserved generation value to set to unused flows for kernel contexts */
#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)

/*
 * J_KEY for kernel contexts when TID RDMA is used.
 * See generate_jkey() in hfi.h for more information.
 */
#define TID_RDMA_JKEY 32
#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)

/* Maximum number of segments in flight per QP request. */
#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6
#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
			TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)

/* Number of pages covered by one MAX_EXPECTED_BUFFER-sized buffer. */
#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE)

#define TID_RDMA_DESTQP_FLOW_SHIFT 11
#define TID_RDMA_DESTQP_FLOW_MASK 0x1f

/* Field masks and shifts of the 64-bit OPFN TID payload (layout below). */
#define TID_OPFN_QP_CTXT_MASK 0xff
#define TID_OPFN_QP_CTXT_SHIFT 56
#define TID_OPFN_QP_KDETH_MASK 0xff
#define TID_OPFN_QP_KDETH_SHIFT 48
#define TID_OPFN_MAX_LEN_MASK 0x7ff
#define TID_OPFN_MAX_LEN_SHIFT 37
#define TID_OPFN_TIMEOUT_MASK 0x1f
#define TID_OPFN_TIMEOUT_SHIFT 32
#define TID_OPFN_RESERVED_MASK 0x3f
#define TID_OPFN_RESERVED_SHIFT 26
#define TID_OPFN_URG_MASK 0x1
#define TID_OPFN_URG_SHIFT 25
#define TID_OPFN_VER_MASK 0x7
#define TID_OPFN_VER_SHIFT 22
#define TID_OPFN_JKEY_MASK 0x3f
#define TID_OPFN_JKEY_SHIFT 16
#define TID_OPFN_MAX_READ_MASK 0x3f
#define TID_OPFN_MAX_READ_SHIFT 10
#define TID_OPFN_MAX_WRITE_MASK 0x3f
#define TID_OPFN_MAX_WRITE_SHIFT 4

/*
 * OPFN TID layout
 *
 * 63               47               31               15
 * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
 * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
 * N - the context Number
 * K - the Kdeth_qp
 * M - Max_len
 * T - Timeout
 * D - reserveD
 * V - version
 * U - Urg capable
 * J - Jkey
 * R - max_Read
 * W - max_Write
 * C - Capcode
 */
109
110	static void tid_rdma_trigger_resume(struct work_struct *work);
111	static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
112	static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
113	gfp_t gfp);
114	static void hfi1_init_trdma_req(struct rvt_qp *qp,
115	struct tid_rdma_request *req);
116	static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
117	static void hfi1_tid_timeout(struct timer_list *t);
118	static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
119	static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
120	static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
121	static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
122	static void hfi1_tid_retry_timeout(struct timer_list *t);
123	static int make_tid_rdma_ack(struct rvt_qp *qp,
124	struct ib_other_headers *ohdr,
125	struct hfi1_pkt_state *ps);
126	static void hfi1_do_tid_send(struct rvt_qp *qp);
127	static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx);
128	static void tid_rdma_rcv_err(struct hfi1_packet *packet,
129	struct ib_other_headers *ohdr,
130	struct rvt_qp qp, u32 psn, int* diff, bool fecn);
131	static void update_r_next_psn_fecn(struct hfi1_packet *packet,
132	struct hfi1_qp_priv *priv,
133	struct hfi1_ctxtdata *rcd,
134	struct tid_rdma_flow *flow,
135	bool fecn);
136
137	static void validate_r_tid_ack(struct hfi1_qp_priv *priv)
138	{
139	if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
140	priv->r_tid_ack = priv->r_tid_tail;
141	}
142
143	static void tid_rdma_schedule_ack(struct rvt_qp *qp)
144	{
145	struct hfi1_qp_priv *priv = qp->priv;
146
147	priv->s_flags \|= RVT_S_ACK_PENDING;
148	hfi1_schedule_tid_send(qp);
149	}
150
151	static void tid_rdma_trigger_ack(struct rvt_qp *qp)
152	{
153	validate_r_tid_ack(priv: qp->priv);
154	tid_rdma_schedule_ack(qp);
155	}
156
157	static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
158	{
159	return
160	(((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
161	TID_OPFN_QP_CTXT_SHIFT) \|
162	((((u64)p->qp >> `16`) & TID_OPFN_QP_KDETH_MASK) <<
163	TID_OPFN_QP_KDETH_SHIFT) \|
164	(((u64)((p->max_len >> PAGE_SHIFT) - `1`) &
165	TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) \|
166	(((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
167	TID_OPFN_TIMEOUT_SHIFT) \|
168	(((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) \|
169	(((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) \|
170	(((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
171	TID_OPFN_MAX_READ_SHIFT) \|
172	(((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
173	TID_OPFN_MAX_WRITE_SHIFT);
174	}
175
176	static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
177	{
178	p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
179	TID_OPFN_MAX_LEN_MASK) + `1`) << PAGE_SHIFT;
180	p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
181	p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
182	TID_OPFN_MAX_WRITE_MASK;
183	p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
184	TID_OPFN_MAX_READ_MASK;
185	p->qp =
186	((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
187	<< `16`) \|
188	((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
189	p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
190	p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
191	}
192
193	void tid_rdma_opfn_init(struct rvt_qp qp, struct* tid_rdma_params *p)
194	{
195	struct hfi1_qp_priv *priv = qp->priv;
196
197	p->qp = (RVT_KDETH_QP_PREFIX << `16`) \| priv->rcd->ctxt;
198	p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
199	p->jkey = priv->rcd->jkey;
200	p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
201	p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
202	p->timeout = qp->timeout;
203	p->urg = is_urg_masked(rcd: priv->rcd);
204	}
205
206	bool tid_rdma_conn_req(struct rvt_qp qp, u64 data)
207	{
208	struct hfi1_qp_priv *priv = qp->priv;
209
210	*data = tid_rdma_opfn_encode(p: &priv->tid_rdma.local);
211	return true;
212	}
213
214	bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
215	{
216	struct hfi1_qp_priv *priv = qp->priv;
217	struct tid_rdma_params remote, old;
218	bool ret = true;
219
220	old = rcu_dereference_protected(priv->tid_rdma.remote,
221	lockdep_is_held(&priv->opfn.lock));
222	data &= ~`0xfULL`;
223	/*
224	* If data passed in is zero, return true so as not to continue the
225	* negotiation process
226	*/
227	if (!data \|\| !HFI1_CAP_IS_KSET(TID_RDMA))
228	goto null;
229	/*
230	* If kzalloc fails, return false. This will result in:
231	* * at the requester a new OPFN request being generated to retry
232	* the negotiation
233	* * at the responder, 0 being returned to the requester so as to
234	* disable TID RDMA at both the requester and the responder
235	*/
236	remote = kzalloc(size: sizeof(*remote), GFP_ATOMIC);
237	if (!remote) {
238	ret = false;
239	goto null;
240	}
241
242	tid_rdma_opfn_decode(p: remote, data);
243	priv->tid_timer_timeout_jiffies =
244	usecs_to_jiffies(u: (((`4096UL` * (`1UL` << remote->timeout)) /
245	`1000UL`) << `3`) * `7`);
246	trace_hfi1_opfn_param(qp, remote: `0`, param: &priv->tid_rdma.local);
247	trace_hfi1_opfn_param(qp, remote: `1`, param: remote);
248	rcu_assign_pointer(priv->tid_rdma.remote, remote);
249	/*
250	* A TID RDMA READ request's segment size is not equal to
251	* remote->max_len only when the request's data length is smaller
252	* than remote->max_len. In that case, there will be only one segment.
253	* Therefore, when priv->pkts_ps is used to calculate req->cur_seg
254	* during retry, it will lead to req->cur_seg = 0, which is exactly
255	* what is expected.
256	*/
257	priv->pkts_ps = (u16)rvt_div_mtu(qp, len: remote->max_len);
258	priv->timeout_shift = ilog2(priv->pkts_ps - `1`) + `1`;
259	goto free;
260	null:
261	RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
262	priv->timeout_shift = `0`;
263	free:
264	if (old)
265	kfree_rcu(old, rcu_head);
266	return ret;
267	}
268
269	bool tid_rdma_conn_resp(struct rvt_qp qp, u64 data)
270	{
271	bool ret;
272
273	ret = tid_rdma_conn_reply(qp, data: *data);
274	*data = `0`;
275	/*
276	* If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
277	* TID RDMA could not be enabled. This will result in TID RDMA being
278	* disabled at the requester too.
279	*/
280	if (ret)
281	(void)tid_rdma_conn_req(qp, data);
282	return ret;
283	}
284
285	void tid_rdma_conn_error(struct rvt_qp *qp)
286	{
287	struct hfi1_qp_priv *priv = qp->priv;
288	struct tid_rdma_params *old;
289
290	old = rcu_dereference_protected(priv->tid_rdma.remote,
291	lockdep_is_held(&priv->opfn.lock));
292	RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
293	if (old)
294	kfree_rcu(old, rcu_head);
295	}
296
297	/ This is called at context initialization time /
298	int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata rcd, int* reinit)
299	{
300	if (reinit)
301	return `0`;
302
303	BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY);
304	BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
305	rcd->jkey = TID_RDMA_JKEY;
306	hfi1_set_ctxt_jkey(dd: rcd->dd, rcd, jkey: rcd->jkey);
307	return hfi1_alloc_ctxt_rcv_groups(rcd);
308	}
309
310	/**
311	* qp_to_rcd - determine the receive context used by a qp
312	* @rdi: rvt dev struct
313	* @qp: the qp
314	*
315	* This routine returns the receive context associated
316	* with a a qp's qpn.
317	*
318	* Return: the context.
319	*/
320	static struct hfi1_ctxtdata qp_to_rcd(struct* rvt_dev_info *rdi,
321	struct rvt_qp *qp)
322	{
323	struct hfi1_ibdev *verbs_dev = container_of(rdi,
324	struct hfi1_ibdev,
325	rdi);
326	struct hfi1_devdata *dd = container_of(verbs_dev,
327	struct hfi1_devdata,
328	verbs_dev);
329	unsigned int ctxt;
330
331	if (qp->ibqp.qp_num == `0`)
332	ctxt = `0`;
333	else
334	ctxt = hfi1_get_qp_map(dd, idx: qp->ibqp.qp_num >> dd->qos_shift);
335	return dd->rcd[ctxt];
336	}
337
338	int hfi1_qp_priv_init(struct rvt_dev_info rdi, struct* rvt_qp *qp,
339	struct ib_qp_init_attr *init_attr)
340	{
341	struct hfi1_qp_priv *qpriv = qp->priv;
342	int i, ret;
343
344	qpriv->rcd = qp_to_rcd(rdi, qp);
345
346	spin_lock_init(&qpriv->opfn.lock);
347	INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
348	INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
349	qpriv->flow_state.psn = `0`;
350	qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
351	qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
352	qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
353	qpriv->s_state = TID_OP(WRITE_RESP);
354	qpriv->s_tid_cur = HFI1_QP_WQE_INVALID;
355	qpriv->s_tid_head = HFI1_QP_WQE_INVALID;
356	qpriv->s_tid_tail = HFI1_QP_WQE_INVALID;
357	qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
358	qpriv->r_tid_head = HFI1_QP_WQE_INVALID;
359	qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
360	qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
361	qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
362	atomic_set(v: &qpriv->n_requests, i: `0`);
363	atomic_set(v: &qpriv->n_tid_requests, i: `0`);
364	timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, `0`);
365	timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, `0`);
366	INIT_LIST_HEAD(list: &qpriv->tid_wait);
367
368	if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
369	struct hfi1_devdata *dd = qpriv->rcd->dd;
370
371	qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
372	sizeof(*qpriv->pages),
373	GFP_KERNEL, node: dd->node);
374	if (!qpriv->pages)
375	return -ENOMEM;
376	for (i = `0`; i < qp->s_size; i++) {
377	struct hfi1_swqe_priv *priv;
378	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n: i);
379
380	priv = kzalloc_node(size: sizeof(*priv), GFP_KERNEL,
381	node: dd->node);
382	if (!priv)
383	return -ENOMEM;
384
385	hfi1_init_trdma_req(qp, req: &priv->tid_req);
386	priv->tid_req.e.swqe = wqe;
387	wqe->priv = priv;
388	}
389	for (i = `0`; i < rvt_max_atomic(rdi); i++) {
390	struct hfi1_ack_priv *priv;
391
392	priv = kzalloc_node(size: sizeof(*priv), GFP_KERNEL,
393	node: dd->node);
394	if (!priv)
395	return -ENOMEM;
396
397	hfi1_init_trdma_req(qp, req: &priv->tid_req);
398	priv->tid_req.e.ack = &qp->s_ack_queue[i];
399
400	ret = hfi1_kern_exp_rcv_alloc_flows(req: &priv->tid_req,
401	GFP_KERNEL);
402	if (ret) {
403	kfree(objp: priv);
404	return ret;
405	}
406	qp->s_ack_queue[i].priv = priv;
407	}
408	}
409
410	return `0`;
411	}
412
413	void hfi1_qp_priv_tid_free(struct rvt_dev_info rdi, struct* rvt_qp *qp)
414	{
415	struct hfi1_qp_priv *qpriv = qp->priv;
416	struct rvt_swqe *wqe;
417	u32 i;
418
419	if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
420	for (i = `0`; i < qp->s_size; i++) {
421	wqe = rvt_get_swqe_ptr(qp, n: i);
422	kfree(objp: wqe->priv);
423	wqe->priv = NULL;
424	}
425	for (i = `0`; i < rvt_max_atomic(rdi); i++) {
426	struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
427
428	if (priv)
429	hfi1_kern_exp_rcv_free_flows(req: &priv->tid_req);
430	kfree(objp: priv);
431	qp->s_ack_queue[i].priv = NULL;
432	}
433	cancel_work_sync(work: &qpriv->opfn.opfn_work);
434	kfree(objp: qpriv->pages);
435	qpriv->pages = NULL;
436	}
437	}
438
/* Flow and tid waiter functions */
440	/**
441	* DOC: lock ordering
442	*
443	* There are two locks involved with the queuing
444	* routines: the qp s_lock and the exp_lock.
445	*
446	* Since the tid space allocation is called from
447	* the send engine, the qp s_lock is already held.
448	*
449	* The allocation routines will get the exp_lock.
450	*
451	* The first_qp() call is provided to allow the head of
452	* the rcd wait queue to be fetched under the exp_lock and
453	* followed by a drop of the exp_lock.
454	*
455	* Any qp in the wait list will have the qp reference count held
456	* to hold the qp in memory.
457	*/
458
459	/*
460	* return head of rcd wait list
461	*
462	* Must hold the exp_lock.
463	*
464	* Get a reference to the QP to hold the QP in memory.
465	*
466	* The caller must release the reference when the local
467	* is no longer being used.
468	*/
469	static struct rvt_qp first_qp(struct* hfi1_ctxtdata *rcd,
470	struct tid_queue *queue)
471	__must_hold(&rcd->exp_lock)
472	{
473	struct hfi1_qp_priv *priv;
474
475	lockdep_assert_held(&rcd->exp_lock);
476	priv = list_first_entry_or_null(&queue->queue_head,
477	struct hfi1_qp_priv,
478	tid_wait);
479	if (!priv)
480	return NULL;
481	rvt_get_qp(qp: priv->owner);
482	return priv->owner;
483	}
484
485	/**
486	* kernel_tid_waiters - determine rcd wait
487	* @rcd: the receive context
488	* @queue: the queue to operate on
489	* @qp: the head of the qp being processed
490	*
491	* This routine will return false IFF
492	* the list is NULL or the head of the
493	* list is the indicated qp.
494	*
495	* Must hold the qp s_lock and the exp_lock.
496	*
497	* Return:
498	* false if either of the conditions below are satisfied:
499	* 1. The list is empty or
500	* 2. The indicated qp is at the head of the list and the
501	* HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
502	* true is returned otherwise.
503	*/
504	static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
505	struct tid_queue queue, struct* rvt_qp *qp)
506	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
507	{
508	struct rvt_qp *fqp;
509	bool ret = true;
510
511	lockdep_assert_held(&qp->s_lock);
512	lockdep_assert_held(&rcd->exp_lock);
513	fqp = first_qp(rcd, queue);
514	if (!fqp \|\| (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
515	ret = false;
516	rvt_put_qp(qp: fqp);
517	return ret;
518	}
519
520	/**
521	* dequeue_tid_waiter - dequeue the qp from the list
522	* @rcd: the receive context
523	* @queue: the queue to operate on
524	* @qp: the qp to remove the wait list
525	*
526	* This routine removes the indicated qp from the
527	* wait list if it is there.
528	*
529	* This should be done after the hardware flow and
530	* tid array resources have been allocated.
531	*
532	* Must hold the qp s_lock and the rcd exp_lock.
533	*
534	* It assumes the s_lock to protect the s_flags
535	* field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
536	*/
537	static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
538	struct tid_queue queue, struct* rvt_qp *qp)
539	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
540	{
541	struct hfi1_qp_priv *priv = qp->priv;
542
543	lockdep_assert_held(&qp->s_lock);
544	lockdep_assert_held(&rcd->exp_lock);
545	if (list_empty(head: &priv->tid_wait))
546	return;
547	list_del_init(entry: &priv->tid_wait);
548	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
549	queue->dequeue++;
550	rvt_put_qp(qp);
551	}
552
553	/**
554	* queue_qp_for_tid_wait - suspend QP on tid space
555	* @rcd: the receive context
556	* @queue: the queue to operate on
557	* @qp: the qp
558	*
559	* The qp is inserted at the tail of the rcd
560	* wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
561	*
562	* Must hold the qp s_lock and the exp_lock.
563	*/
564	static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
565	struct tid_queue queue, struct* rvt_qp *qp)
566	__must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
567	{
568	struct hfi1_qp_priv *priv = qp->priv;
569
570	lockdep_assert_held(&qp->s_lock);
571	lockdep_assert_held(&rcd->exp_lock);
572	if (list_empty(head: &priv->tid_wait)) {
573	qp->s_flags \|= HFI1_S_WAIT_TID_SPACE;
574	list_add_tail(new: &priv->tid_wait, head: &queue->queue_head);
575	priv->tid_enqueue = ++queue->enqueue;
576	rcd->dd->verbs_dev.n_tidwait++;
577	trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
578	rvt_get_qp(qp);
579	}
580	}
581
582	/**
583	* __trigger_tid_waiter - trigger tid waiter
584	* @qp: the qp
585	*
586	* This is a private entrance to schedule the qp
587	* assuming the caller is holding the qp->s_lock.
588	*/
589	static void __trigger_tid_waiter(struct rvt_qp *qp)
590	__must_hold(&qp->s_lock)
591	{
592	lockdep_assert_held(&qp->s_lock);
593	if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
594	return;
595	trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
596	hfi1_schedule_send(qp);
597	}
598
599	/**
600	* tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
601	* @qp: the qp
602	*
603	* trigger a schedule or a waiting qp in a deadlock
604	* safe manner. The qp reference is held prior
605	* to this call via first_qp().
606	*
607	* If the qp trigger was already scheduled (!rval)
608	* the reference is dropped, otherwise the resume
609	* or the destroy cancel will dispatch the reference.
610	*/
611	static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
612	{
613	struct hfi1_qp_priv *priv;
614	struct hfi1_ibport *ibp;
615	struct hfi1_pportdata *ppd;
616	struct hfi1_devdata *dd;
617	bool rval;
618
619	if (!qp)
620	return;
621
622	priv = qp->priv;
623	ibp = to_iport(ibdev: qp->ibqp.device, port: qp->port_num);
624	ppd = ppd_from_ibp(ibp);
625	dd = dd_from_ibdev(ibdev: qp->ibqp.device);
626
627	rval = queue_work_on(cpu: priv->s_sde ?
628	priv->s_sde->cpu :
629	cpumask_first(srcp: cpumask_of_node(node: dd->node)),
630	wq: ppd->hfi1_wq,
631	work: &priv->tid_rdma.trigger_work);
632	if (!rval)
633	rvt_put_qp(qp);
634	}
635
636	/**
637	* tid_rdma_trigger_resume - field a trigger work request
638	* @work: the work item
639	*
640	* Complete the off qp trigger processing by directly
641	* calling the progress routine.
642	*/
643	static void tid_rdma_trigger_resume(struct work_struct *work)
644	{
645	struct tid_rdma_qp_params *tr;
646	struct hfi1_qp_priv *priv;
647	struct rvt_qp *qp;
648
649	tr = container_of(work, struct tid_rdma_qp_params, trigger_work);
650	priv = container_of(tr, struct hfi1_qp_priv, tid_rdma);
651	qp = priv->owner;
652	spin_lock_irq(lock: &qp->s_lock);
653	if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
654	spin_unlock_irq(lock: &qp->s_lock);
655	hfi1_do_send(qp: priv->owner, in_thread: true);
656	} else {
657	spin_unlock_irq(lock: &qp->s_lock);
658	}
659	rvt_put_qp(qp);
660	}
661
662	/*
663	* tid_rdma_flush_wait - unwind any tid space wait
664	*
665	* This is called when resetting a qp to
666	* allow a destroy or reset to get rid
667	* of any tid space linkage and reference counts.
668	*/
669	static void _tid_rdma_flush_wait(struct rvt_qp qp, struct* tid_queue *queue)
670	__must_hold(&qp->s_lock)
671	{
672	struct hfi1_qp_priv *priv;
673
674	if (!qp)
675	return;
676	lockdep_assert_held(&qp->s_lock);
677	priv = qp->priv;
678	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
679	spin_lock(lock: &priv->rcd->exp_lock);
680	if (!list_empty(head: &priv->tid_wait)) {
681	list_del_init(entry: &priv->tid_wait);
682	qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
683	queue->dequeue++;
684	rvt_put_qp(qp);
685	}
686	spin_unlock(lock: &priv->rcd->exp_lock);
687	}
688
689	void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
690	__must_hold(&qp->s_lock)
691	{
692	struct hfi1_qp_priv *priv = qp->priv;
693
694	_tid_rdma_flush_wait(qp, queue: &priv->rcd->flow_queue);
695	_tid_rdma_flush_wait(qp, queue: &priv->rcd->rarr_queue);
696	}
697
/* Flow functions */
699	/**
700	* kern_reserve_flow - allocate a hardware flow
701	* @rcd: the context to use for allocation
702	* @last: the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
703	* signify "don't care".
704	*
705	* Use a bit mask based allocation to reserve a hardware
706	* flow for use in receiving KDETH data packets. If a preferred flow is
707	* specified the function will attempt to reserve that flow again, if
708	* available.
709	*
710	* The exp_lock must be held.
711	*
712	* Return:
713	* On success: a value positive value between 0 and RXE_NUM_TID_FLOWS - 1
714	* On failure: -EAGAIN
715	*/
716	static int kern_reserve_flow(struct hfi1_ctxtdata rcd, int* last)
717	__must_hold(&rcd->exp_lock)
718	{
719	int nr;
720
721	/ Attempt to reserve the preferred flow index /
722	if (last >= `0` && last < RXE_NUM_TID_FLOWS &&
723	!test_and_set_bit(nr: last, addr: &rcd->flow_mask))
724	return last;
725
726	nr = ffz(rcd->flow_mask);
727	BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
728	(sizeof(rcd->flow_mask) * BITS_PER_BYTE));
729	if (nr > (RXE_NUM_TID_FLOWS - `1`))
730	return -EAGAIN;
731	set_bit(nr, addr: &rcd->flow_mask);
732	return nr;
733	}
734
735	static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
736	u32 flow_idx)
737	{
738	u64 reg;
739
740	reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) \|
741	RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK \|
742	RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK \|
743	RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK \|
744	RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK \|
745	RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;
746
747	if (generation != KERN_GENERATION_RESERVED)
748	reg \|= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;
749
750	write_uctxt_csr(dd: rcd->dd, ctxt: rcd->ctxt,
751	RCV_TID_FLOW_TABLE + `8` * flow_idx, value: reg);
752	}
753
754	static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
755	__must_hold(&rcd->exp_lock)
756	{
757	u32 generation = rcd->flows[flow_idx].generation;
758
759	kern_set_hw_flow(rcd, generation, flow_idx);
760	return generation;
761	}
762
763	static u32 kern_flow_generation_next(u32 gen)
764	{
765	u32 generation = mask_generation(a: gen + `1`);
766
767	if (generation == KERN_GENERATION_RESERVED)
768	generation = mask_generation(a: generation + `1`);
769	return generation;
770	}
771
772	static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
773	__must_hold(&rcd->exp_lock)
774	{
775	rcd->flows[flow_idx].generation =
776	kern_flow_generation_next(gen: rcd->flows[flow_idx].generation);
777	kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
778	}
779
780	int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata rcd, struct* rvt_qp *qp)
781	{
782	struct hfi1_qp_priv qpriv = (struct* hfi1_qp_priv *)qp->priv;
783	struct tid_flow_state *fs = &qpriv->flow_state;
784	struct rvt_qp *fqp;
785	unsigned long flags;
786	int ret = `0`;
787
788	/ The QP already has an allocated flow /
789	if (fs->index != RXE_NUM_TID_FLOWS)
790	return ret;
791
792	spin_lock_irqsave(&rcd->exp_lock, flags);
793	if (kernel_tid_waiters(rcd, queue: &rcd->flow_queue, qp))
794	goto queue;
795
796	ret = kern_reserve_flow(rcd, last: fs->last_index);
797	if (ret < `0`)
798	goto queue;
799	fs->index = ret;
800	fs->last_index = fs->index;
801
802	/ Generation received in a RESYNC overrides default flow generation /
803	if (fs->generation != KERN_GENERATION_RESERVED)
804	rcd->flows[fs->index].generation = fs->generation;
805	fs->generation = kern_setup_hw_flow(rcd, flow_idx: fs->index);
806	fs->psn = `0`;
807	dequeue_tid_waiter(rcd, queue: &rcd->flow_queue, qp);
808	/ get head before dropping lock /
809	fqp = first_qp(rcd, queue: &rcd->flow_queue);
810	spin_unlock_irqrestore(lock: &rcd->exp_lock, flags);
811
812	tid_rdma_schedule_tid_wakeup(qp: fqp);
813	return `0`;
814	queue:
815	queue_qp_for_tid_wait(rcd, queue: &rcd->flow_queue, qp);
816	spin_unlock_irqrestore(lock: &rcd->exp_lock, flags);
817	return -EAGAIN;
818	}
819
820	void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata rcd, struct* rvt_qp *qp)
821	{
822	struct hfi1_qp_priv qpriv = (struct* hfi1_qp_priv *)qp->priv;
823	struct tid_flow_state *fs = &qpriv->flow_state;
824	struct rvt_qp *fqp;
825	unsigned long flags;
826
827	if (fs->index >= RXE_NUM_TID_FLOWS)
828	return;
829	spin_lock_irqsave(&rcd->exp_lock, flags);
830	kern_clear_hw_flow(rcd, flow_idx: fs->index);
831	clear_bit(nr: fs->index, addr: &rcd->flow_mask);
832	fs->index = RXE_NUM_TID_FLOWS;
833	fs->psn = `0`;
834	fs->generation = KERN_GENERATION_RESERVED;
835
836	/ get head before dropping lock /
837	fqp = first_qp(rcd, queue: &rcd->flow_queue);
838	spin_unlock_irqrestore(lock: &rcd->exp_lock, flags);
839
840	if (fqp == qp) {
841	__trigger_tid_waiter(qp: fqp);
842	rvt_put_qp(qp: fqp);
843	} else {
844	tid_rdma_schedule_tid_wakeup(qp: fqp);
845	}
846	}
847
848	void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
849	{
850	int i;
851
852	for (i = `0`; i < RXE_NUM_TID_FLOWS; i++) {
853	rcd->flows[i].generation = mask_generation(a: get_random_u32());
854	kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx: i);
855	}
856	}
857
/* TID allocation functions */
859	static u8 trdma_pset_order(struct tid_rdma_pageset *s)
860	{
861	u8 count = s->count;
862
863	return ilog2(count) + `1`;
864	}
865
866	/**
867	* tid_rdma_find_phys_blocks_4k - get groups base on mr info
868	* @flow: overall info for a TID RDMA segment
869	* @pages: pointer to an array of page structs
870	* @npages: number of pages
871	* @list: page set array to return
872	*
873	* This routine returns the number of groups associated with
874	* the current sge information. This implementation is based
875	* on the expected receive find_phys_blocks() adjusted to
876	* use the MR information vs. the pfn.
877	*
878	* Return:
879	* the number of RcvArray entries
880	*/
881	static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
882	struct page **pages,
883	u32 npages,
884	struct tid_rdma_pageset *list)
885	{
886	u32 pagecount, pageidx, setcount = `0`, i;
887	void vaddr, this_vaddr;
888
889	if (!npages)
890	return `0`;
891
892	/*
893	* Look for sets of physically contiguous pages in the user buffer.
894	* This will allow us to optimize Expected RcvArray entry usage by
895	* using the bigger supported sizes.
896	*/
897	vaddr = page_address(pages[`0`]);
898	trace_hfi1_tid_flow_page(qp: flow->req->qp, flow, index: `0`, mtu8k: `0`, v1: `0`, vaddr);
899	for (pageidx = `0`, pagecount = `1`, i = `1`; i <= npages; i++) {
900	this_vaddr = i < npages ? page_address(pages[i]) : NULL;
901	trace_hfi1_tid_flow_page(qp: flow->req->qp, flow, index: i, mtu8k: `0`, v1: `0`,
902	vaddr: this_vaddr);
903	/*
904	* If the vaddr's are not sequential, pages are not physically
905	* contiguous.
906	*/
907	if (this_vaddr != (vaddr + PAGE_SIZE)) {
908	/*
909	* At this point we have to loop over the set of
910	* physically contiguous pages and break them down it
911	* sizes supported by the HW.
912	* There are two main constraints:
913	* 1. The max buffer size is MAX_EXPECTED_BUFFER.
914	* If the total set size is bigger than that
915	* program only a MAX_EXPECTED_BUFFER chunk.
916	* 2. The buffer size has to be a power of two. If
917	* it is not, round down to the closes power of
918	* 2 and program that size.
919	*/
920	while (pagecount) {
921	int maxpages = pagecount;
922	u32 bufsize = pagecount * PAGE_SIZE;
923
924	if (bufsize > MAX_EXPECTED_BUFFER)
925	maxpages =
926	MAX_EXPECTED_BUFFER >>
927	PAGE_SHIFT;
928	else if (!is_power_of_2(n: bufsize))
929	maxpages =
930	rounddown_pow_of_two(bufsize) >>
931	PAGE_SHIFT;
932
933	list[setcount].idx = pageidx;
934	list[setcount].count = maxpages;
935	trace_hfi1_tid_pageset(qp: flow->req->qp, index: setcount,
936	idx: list[setcount].idx,
937	count: list[setcount].count);
938	pagecount -= maxpages;
939	pageidx += maxpages;
940	setcount++;
941	}
942	pageidx = i;
943	pagecount = `1`;
944	vaddr = this_vaddr;
945	} else {
946	vaddr += PAGE_SIZE;
947	pagecount++;
948	}
949	}
950	/ insure we always return an even number of sets /
951	if (setcount & `1`)
952	list[setcount++].count = `0`;
953	return setcount;
954	}
955
956	/**
957	* tid_flush_pages - dump out pages into pagesets
958	* @list: list of pagesets
959	* @idx: pointer to current page index
960	* @pages: number of pages to dump
961	* @sets: current number of pagesset
962	*
963	* This routine flushes out accumuated pages.
964	*
965	* To insure an even number of sets the
966	* code may add a filler.
967	*
968	* This can happen with when pages is not
969	* a power of 2 or pages is a power of 2
970	* less than the maximum pages.
971	*
972	* Return:
973	* The new number of sets
974	*/
975
976	static u32 tid_flush_pages(struct tid_rdma_pageset *list,
977	u32 *idx, u32 pages, u32 sets)
978	{
979	while (pages) {
980	u32 maxpages = pages;
981
982	if (maxpages > MAX_EXPECTED_PAGES)
983	maxpages = MAX_EXPECTED_PAGES;
984	else if (!is_power_of_2(n: maxpages))
985	maxpages = rounddown_pow_of_two(maxpages);
986	list[sets].idx = *idx;
987	list[sets++].count = maxpages;
988	*idx += maxpages;
989	pages -= maxpages;
990	}
991	/ might need a filler /
992	if (sets & `1`)
993	list[sets++].count = `0`;
994	return sets;
995	}
996
997	/**
998	* tid_rdma_find_phys_blocks_8k - get groups base on mr info
999	* @flow: overall info for a TID RDMA segment
1000	* @pages: pointer to an array of page structs
1001	* @npages: number of pages
1002	* @list: page set array to return
1003	*
1004	* This routine parses an array of pages to compute pagesets
1005	* in an 8k compatible way.
1006	*
1007	* pages are tested two at a time, i, i + 1 for contiguous
1008	* pages and i - 1 and i contiguous pages.
1009	*
1010	* If any condition is false, any accumulated pages are flushed and
1011	* v0,v1 are emitted as separate PAGE_SIZE pagesets
1012	*
1013	* Otherwise, the current 8k is totaled for a future flush.
1014	*
1015	* Return:
1016	* The number of pagesets
1017	* list set with the returned number of pagesets
1018	*
1019	*/
1020	static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
1021	struct page **pages,
1022	u32 npages,
1023	struct tid_rdma_pageset *list)
1024	{
1025	u32 idx, sets = `0`, i;
1026	u32 pagecnt = `0`;
1027	void v0, v1, *vm1;
1028
1029	if (!npages)
1030	return `0`;
1031	for (idx = `0`, i = `0`, vm1 = NULL; i < npages; i += `2`) {
1032	/ get a new v0 /
1033	v0 = page_address(pages[i]);
1034	trace_hfi1_tid_flow_page(qp: flow->req->qp, flow, index: i, mtu8k: `1`, v1: `0`, vaddr: v0);
1035	v1 = i + `1` < npages ?
1036	page_address(pages[i + `1`]) : NULL;
1037	trace_hfi1_tid_flow_page(qp: flow->req->qp, flow, index: i, mtu8k: `1`, v1: `1`, vaddr: v1);
1038	/ compare i, i + 1 vaddr /
1039	if (v1 != (v0 + PAGE_SIZE)) {
1040	/ flush out pages /
1041	sets = tid_flush_pages(list, idx: &idx, pages: pagecnt, sets);
1042	/ output v0,v1 as two pagesets /
1043	list[sets].idx = idx++;
1044	list[sets++].count = `1`;
1045	if (v1) {
1046	list[sets].count = `1`;
1047	list[sets++].idx = idx++;
1048	} else {
1049	list[sets++].count = `0`;
1050	}
1051	vm1 = NULL;
1052	pagecnt = `0`;
1053	continue;
1054	}
1055	/ i,i+1 consecutive, look at i-1,i /
1056	if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
1057	/ flush out pages /
1058	sets = tid_flush_pages(list, idx: &idx, pages: pagecnt, sets);
1059	pagecnt = `0`;
1060	}
1061	/ pages will always be a multiple of 8k /
1062	pagecnt += `2`;
1063	/ save i-1 /
1064	vm1 = v1;
1065	/ move to next pair /
1066	}
1067	/ dump residual pages at end /
1068	sets = tid_flush_pages(list, idx: &idx, pages: npages - idx, sets);
1069	/ by design cannot be odd sets /
1070	WARN_ON(sets & `1`);
1071	return sets;
1072	}
1073
1074	/*
1075	* Find pages for one segment of a sge array represented by @ss. The function
1076	* does not check the sge, the sge must have been checked for alignment with a
1077	* prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
1078	* rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
1079	* copy maintained in @ss->sge, the original sge is not modified.
1080	*
1081	* Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
1082	* releasing the MR reference count at the same time. Otherwise, we'll "leak"
1083	* references to the MR. This difference requires that we keep track of progress
1084	* into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
1085	* structure.
1086	*/
1087	static u32 kern_find_pages(struct tid_rdma_flow *flow,
1088	struct page **pages,
1089	struct rvt_sge_state ss, bool last)
1090	{
1091	struct tid_rdma_request *req = flow->req;
1092	struct rvt_sge *sge = &ss->sge;
1093	u32 length = flow->req->seg_len;
1094	u32 len = PAGE_SIZE;
1095	u32 i = `0`;
1096
1097	while (length && req->isge < ss->num_sge) {
1098	pages[i++] = virt_to_page(sge->vaddr);
1099
1100	sge->vaddr += len;
1101	sge->length -= len;
1102	sge->sge_length -= len;
1103	if (!sge->sge_length) {
1104	if (++req->isge < ss->num_sge)
1105	*sge = ss->sg_list[req->isge - `1`];
1106	} else if (sge->length == `0` && sge->mr->lkey) {
1107	if (++sge->n >= RVT_SEGSZ) {
1108	++sge->m;
1109	sge->n = `0`;
1110	}
1111	sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
1112	sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
1113	}
1114	length -= len;
1115	}
1116
1117	flow->length = flow->req->seg_len - length;
1118	*last = req->isge != ss->num_sge;
1119	return i;
1120	}
1121
1122	static void dma_unmap_flow(struct tid_rdma_flow *flow)
1123	{
1124	struct hfi1_devdata *dd;
1125	int i;
1126	struct tid_rdma_pageset *pset;
1127
1128	dd = flow->req->rcd->dd;
1129	for (i = `0`, pset = &flow->pagesets[`0`]; i < flow->npagesets;
1130	i++, pset++) {
1131	if (pset->count && pset->addr) {
1132	dma_unmap_page(&dd->pcidev->dev,
1133	pset->addr,
1134	PAGE_SIZE * pset->count,
1135	DMA_FROM_DEVICE);
1136	pset->mapped = `0`;
1137	}
1138	}
1139	}
1140
1141	static int dma_map_flow(struct tid_rdma_flow flow, struct* page **pages)
1142	{
1143	int i;
1144	struct hfi1_devdata *dd = flow->req->rcd->dd;
1145	struct tid_rdma_pageset *pset;
1146
1147	for (i = `0`, pset = &flow->pagesets[`0`]; i < flow->npagesets;
1148	i++, pset++) {
1149	if (pset->count) {
1150	pset->addr = dma_map_page(&dd->pcidev->dev,
1151	pages[pset->idx],
1152	`0`,
1153	PAGE_SIZE * pset->count,
1154	DMA_FROM_DEVICE);
1155
1156	if (dma_mapping_error(dev: &dd->pcidev->dev, dma_addr: pset->addr)) {
1157	dma_unmap_flow(flow);
1158	return -ENOMEM;
1159	}
1160	pset->mapped = `1`;
1161	}
1162	}
1163	return `0`;
1164	}
1165
1166	static inline bool dma_mapped(struct tid_rdma_flow *flow)
1167	{
1168	return !!flow->pagesets[`0`].mapped;
1169	}
1170
1171	/*
1172	* Get pages pointers and identify contiguous physical memory chunks for a
1173	* segment. All segments are of length flow->req->seg_len.
1174	*/
1175	static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
1176	struct page **pages,
1177	struct rvt_sge_state ss, bool last)
1178	{
1179	u8 npages;
1180
1181	/ Reuse previously computed pagesets, if any /
1182	if (flow->npagesets) {
1183	trace_hfi1_tid_flow_alloc(qp: flow->req->qp, index: flow->req->setup_head,
1184	flow);
1185	if (!dma_mapped(flow))
1186	return dma_map_flow(flow, pages);
1187	return `0`;
1188	}
1189
1190	npages = kern_find_pages(flow, pages, ss, last);
1191
1192	if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
1193	flow->npagesets =
1194	tid_rdma_find_phys_blocks_4k(flow, pages, npages,
1195	list: flow->pagesets);
1196	else
1197	flow->npagesets =
1198	tid_rdma_find_phys_blocks_8k(flow, pages, npages,
1199	list: flow->pagesets);
1200
1201	return dma_map_flow(flow, pages);
1202	}
1203
1204	static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
1205	struct hfi1_ctxtdata rcd, char* *s,
1206	struct tid_group *grp, u8 cnt)
1207	{
1208	struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];
1209
1210	WARN_ON_ONCE(flow->tnode_cnt >=
1211	(TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
1212	if (WARN_ON_ONCE(cnt & `1`))
1213	dd_dev_err(rcd->dd,
1214	"unexpected odd allocation cnt %u map 0x%x used %u",
1215	cnt, grp->map, grp->used);
1216
1217	node->grp = grp;
1218	node->map = grp->map;
1219	node->cnt = cnt;
1220	trace_hfi1_tid_node_add(qp: flow->req->qp, msg: s, index: flow->tnode_cnt - `1`,
1221	base: grp->base, map: grp->map, used: grp->used, cnt);
1222	}
1223
1224	/*
1225	* Try to allocate pageset_count TID's from TID groups for a context
1226	*
1227	* This function allocates TID's without moving groups between lists or
1228	* modifying grp->map. This is done as follows, being cogizant of the lists
1229	* between which the TID groups will move:
1230	* 1. First allocate complete groups of 8 TID's since this is more efficient,
1231	* these groups will move from group->full without affecting used
1232	* 2. If more TID's are needed allocate from used (will move from used->full or
1233	* stay in used)
1234	* 3. If we still don't have the required number of TID's go back and look again
1235	* at a complete group (will move from group->used)
1236	*/
1237	static int kern_alloc_tids(struct tid_rdma_flow *flow)
1238	{
1239	struct hfi1_ctxtdata *rcd = flow->req->rcd;
1240	struct hfi1_devdata *dd = rcd->dd;
1241	u32 ngroups, pageidx = `0`;
1242	struct tid_group group = NULL, used;
1243	u8 use;
1244
1245	flow->tnode_cnt = `0`;
1246	ngroups = flow->npagesets / dd->rcv_entries.group_size;
1247	if (!ngroups)
1248	goto used_list;
1249
1250	/ First look at complete groups /
1251	list_for_each_entry(group, &rcd->tid_group_list.list, list) {
1252	kern_add_tid_node(flow, rcd, s: "complete groups", grp: group,
1253	cnt: group->size);
1254
1255	pageidx += group->size;
1256	if (!--ngroups)
1257	break;
1258	}
1259
1260	if (pageidx >= flow->npagesets)
1261	goto ok;
1262
1263	used_list:
1264	/ Now look at partially used groups /
1265	list_for_each_entry(used, &rcd->tid_used_list.list, list) {
1266	use = min_t(u32, flow->npagesets - pageidx,
1267	used->size - used->used);
1268	kern_add_tid_node(flow, rcd, s: "used groups", grp: used, cnt: use);
1269
1270	pageidx += use;
1271	if (pageidx >= flow->npagesets)
1272	goto ok;
1273	}
1274
1275	/*
1276	* Look again at a complete group, continuing from where we left.
1277	* However, if we are at the head, we have reached the end of the
1278	* complete groups list from the first loop above
1279	*/
1280	if (group && &group->list == &rcd->tid_group_list.list)
1281	goto bail_eagain;
1282	group = list_prepare_entry(group, &rcd->tid_group_list.list,
1283	list);
1284	if (list_is_last(list: &group->list, head: &rcd->tid_group_list.list))
1285	goto bail_eagain;
1286	group = list_next_entry(group, list);
1287	use = min_t(u32, flow->npagesets - pageidx, group->size);
1288	kern_add_tid_node(flow, rcd, s: "complete continue", grp: group, cnt: use);
1289	pageidx += use;
1290	if (pageidx >= flow->npagesets)
1291	goto ok;
1292	bail_eagain:
1293	trace_hfi1_msg_alloc_tids(qp: flow->req->qp, msg: " insufficient tids: needed ",
1294	more: (u64)flow->npagesets);
1295	return -EAGAIN;
1296	ok:
1297	return `0`;
1298	}
1299
1300	static void kern_program_rcv_group(struct tid_rdma_flow flow, int* grp_num,
1301	u32 *pset_idx)
1302	{
1303	struct hfi1_ctxtdata *rcd = flow->req->rcd;
1304	struct hfi1_devdata *dd = rcd->dd;
1305	struct kern_tid_node *node = &flow->tnode[grp_num];
1306	struct tid_group *grp = node->grp;
1307	struct tid_rdma_pageset *pset;
1308	u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
1309	u32 rcventry, npages = `0`, pair = `0`, tidctrl;
1310	u8 i, cnt = `0`;
1311
1312	for (i = `0`; i < grp->size; i++) {
1313	rcventry = grp->base + i;
1314
1315	if (node->map & BIT(i) \|\| cnt >= node->cnt) {
1316	rcv_array_wc_fill(dd, index: rcventry);
1317	continue;
1318	}
1319	pset = &flow->pagesets[(*pset_idx)++];
1320	if (pset->count) {
1321	hfi1_put_tid(dd, index: rcventry, PT_EXPECTED,
1322	pa: pset->addr, order: trdma_pset_order(s: pset));
1323	} else {
1324	hfi1_put_tid(dd, index: rcventry, PT_INVALID, pa: `0`, order: `0`);
1325	}
1326	npages += pset->count;
1327
1328	rcventry -= rcd->expected_base;
1329	tidctrl = pair ? `0x3` : rcventry & `0x1` ? `0x2` : `0x1`;
1330	/*
1331	* A single TID entry will be used to use a rcvarr pair (with
1332	* tidctrl 0x3), if ALL these are true (a) the bit pos is even
1333	* (b) the group map shows current and the next bits as free
1334	* indicating two consecutive rcvarry entries are available (c)
1335	* we actually need 2 more entries
1336	*/
1337	pair = !(i & `0x1`) && !((node->map >> i) & `0x3`) &&
1338	node->cnt >= cnt + `2`;
1339	if (!pair) {
1340	if (!pset->count)
1341	tidctrl = `0x1`;
1342	flow->tid_entry[flow->tidcnt++] =
1343	EXP_TID_SET(IDX, rcventry >> `1`) \|
1344	EXP_TID_SET(CTRL, tidctrl) \|
1345	EXP_TID_SET(LEN, npages);
1346	trace_hfi1_tid_entry_alloc(/ entry /
1347	qp: flow->req->qp, index: flow->tidcnt - `1`,
1348	entry: flow->tid_entry[flow->tidcnt - `1`]);
1349
1350	/ Efficient DIV_ROUND_UP(npages, pmtu_pg) /
1351	flow->npkts += (npages + pmtu_pg - `1`) >> ilog2(pmtu_pg);
1352	npages = `0`;
1353	}
1354
1355	if (grp->used == grp->size - `1`)
1356	tid_group_move(group: grp, s1: &rcd->tid_used_list,
1357	s2: &rcd->tid_full_list);
1358	else if (!grp->used)
1359	tid_group_move(group: grp, s1: &rcd->tid_group_list,
1360	s2: &rcd->tid_used_list);
1361
1362	grp->used++;
1363	grp->map \|= BIT(i);
1364	cnt++;
1365	}
1366	}
1367
1368	static void kern_unprogram_rcv_group(struct tid_rdma_flow flow, int* grp_num)
1369	{
1370	struct hfi1_ctxtdata *rcd = flow->req->rcd;
1371	struct hfi1_devdata *dd = rcd->dd;
1372	struct kern_tid_node *node = &flow->tnode[grp_num];
1373	struct tid_group *grp = node->grp;
1374	u32 rcventry;
1375	u8 i, cnt = `0`;
1376
1377	for (i = `0`; i < grp->size; i++) {
1378	rcventry = grp->base + i;
1379
1380	if (node->map & BIT(i) \|\| cnt >= node->cnt) {
1381	rcv_array_wc_fill(dd, index: rcventry);
1382	continue;
1383	}
1384
1385	hfi1_put_tid(dd, index: rcventry, PT_INVALID, pa: `0`, order: `0`);
1386
1387	grp->used--;
1388	grp->map &= ~BIT(i);
1389	cnt++;
1390
1391	if (grp->used == grp->size - `1`)
1392	tid_group_move(group: grp, s1: &rcd->tid_full_list,
1393	s2: &rcd->tid_used_list);
1394	else if (!grp->used)
1395	tid_group_move(group: grp, s1: &rcd->tid_used_list,
1396	s2: &rcd->tid_group_list);
1397	}
1398	if (WARN_ON_ONCE(cnt & `1`)) {
1399	struct hfi1_ctxtdata *rcd = flow->req->rcd;
1400	struct hfi1_devdata *dd = rcd->dd;
1401
1402	dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
1403	cnt, grp->map, grp->used);
1404	}
1405	}
1406
1407	static void kern_program_rcvarray(struct tid_rdma_flow *flow)
1408	{
1409	u32 pset_idx = `0`;
1410	int i;
1411
1412	flow->npkts = `0`;
1413	flow->tidcnt = `0`;
1414	for (i = `0`; i < flow->tnode_cnt; i++)
1415	kern_program_rcv_group(flow, grp_num: i, pset_idx: &pset_idx);
1416	trace_hfi1_tid_flow_alloc(qp: flow->req->qp, index: flow->req->setup_head, flow);
1417	}
1418
1419	/**
1420	* hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
1421	* TID RDMA request
1422	*
1423	* @req: TID RDMA request for which the segment/flow is being set up
1424	* @ss: sge state, maintains state across successive segments of a sge
1425	* @last: set to true after the last sge segment has been processed
1426	*
1427	* This function
1428	* (1) finds a free flow entry in the flow circular buffer
1429	* (2) finds pages and continuous physical chunks constituing one segment
1430	* of an sge
1431	* (3) allocates TID group entries for those chunks
1432	* (4) programs rcvarray entries in the hardware corresponding to those
1433	* TID's
1434	* (5) computes a tidarray with formatted TID entries which can be sent
1435	* to the sender
1436	* (6) Reserves and programs HW flows.
1437	* (7) It also manages queueing the QP when TID/flow resources are not
1438	* available.
1439	*
1440	* @req points to struct tid_rdma_request of which the segments are a part. The
1441	* function uses qp, rcd and seg_len members of @req. In the absence of errors,
1442	* req->flow_idx is the index of the flow which has been prepared in this
1443	* invocation of function call. With flow = &req->flows[req->flow_idx],
1444	* flow->tid_entry contains the TID array which the sender can use for TID RDMA
1445	* sends and flow->npkts contains number of packets required to send the
1446	* segment.
1447	*
1448	* hfi1_check_sge_align should be called prior to calling this function and if
1449	* it signals error TID RDMA cannot be used for this sge and this function
1450	* should not be called.
1451	*
1452	* For the queuing, caller must hold the flow->req->qp s_lock from the send
1453	* engine and the function will procure the exp_lock.
1454	*
1455	* Return:
1456	* The function returns -EAGAIN if sufficient number of TID/flow resources to
1457	* map the segment could not be allocated. In this case the function should be
1458	* called again with previous arguments to retry the TID allocation. There are
1459	* no other error returns. The function returns 0 on success.
1460	*/
1461	int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
1462	struct rvt_sge_state ss, bool last)
1463	__must_hold(&req->qp->s_lock)
1464	{
1465	struct tid_rdma_flow *flow = &req->flows[req->setup_head];
1466	struct hfi1_ctxtdata *rcd = req->rcd;
1467	struct hfi1_qp_priv *qpriv = req->qp->priv;
1468	unsigned long flags;
1469	struct rvt_qp *fqp;
1470	u16 clear_tail = req->clear_tail;
1471
1472	lockdep_assert_held(&req->qp->s_lock);
1473	/*
1474	* We return error if either (a) we don't have space in the flow
1475	* circular buffer, or (b) we already have max entries in the buffer.
1476	* Max entries depend on the type of request we are processing and the
1477	* negotiated TID RDMA parameters.
1478	*/
1479	if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) \|\|
1480	CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
1481	req->n_flows)
1482	return -EINVAL;
1483
1484	/*
1485	* Get pages, identify contiguous physical memory chunks for the segment
1486	* If we can not determine a DMA address mapping we will treat it just
1487	* like if we ran out of space above.
1488	*/
1489	if (kern_get_phys_blocks(flow, pages: qpriv->pages, ss, last)) {
1490	hfi1_wait_kmem(qp: flow->req->qp);
1491	return -ENOMEM;
1492	}
1493
1494	spin_lock_irqsave(&rcd->exp_lock, flags);
1495	if (kernel_tid_waiters(rcd, queue: &rcd->rarr_queue, qp: flow->req->qp))
1496	goto queue;
1497
1498	/*
1499	* At this point we know the number of pagesets and hence the number of
1500	* TID's to map the segment. Allocate the TID's from the TID groups. If
1501	* we cannot allocate the required number we exit and try again later
1502	*/
1503	if (kern_alloc_tids(flow))
1504	goto queue;
1505	/*
1506	* Finally program the TID entries with the pagesets, compute the
1507	* tidarray and enable the HW flow
1508	*/
1509	kern_program_rcvarray(flow);
1510
1511	/*
1512	* Setup the flow state with relevant information.
1513	* This information is used for tracking the sequence of data packets
1514	* for the segment.
1515	* The flow is setup here as this is the most accurate time and place
1516	* to do so. Doing at a later time runs the risk of the flow data in
1517	* qpriv getting out of sync.
1518	*/
1519	memset(&flow->flow_state, `0x0`, sizeof(flow->flow_state));
1520	flow->idx = qpriv->flow_state.index;
1521	flow->flow_state.generation = qpriv->flow_state.generation;
1522	flow->flow_state.spsn = qpriv->flow_state.psn;
1523	flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - `1`;
1524	flow->flow_state.r_next_psn =
1525	full_flow_psn(flow, psn: flow->flow_state.spsn);
1526	qpriv->flow_state.psn += flow->npkts;
1527
1528	dequeue_tid_waiter(rcd, queue: &rcd->rarr_queue, qp: flow->req->qp);
1529	/ get head before dropping lock /
1530	fqp = first_qp(rcd, queue: &rcd->rarr_queue);
1531	spin_unlock_irqrestore(lock: &rcd->exp_lock, flags);
1532	tid_rdma_schedule_tid_wakeup(qp: fqp);
1533
1534	req->setup_head = (req->setup_head + `1`) & (MAX_FLOWS - `1`);
1535	return `0`;
1536	queue:
1537	queue_qp_for_tid_wait(rcd, queue: &rcd->rarr_queue, qp: flow->req->qp);
1538	spin_unlock_irqrestore(lock: &rcd->exp_lock, flags);
1539	return -EAGAIN;
1540	}
1541
1542	static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
1543	{
1544	flow->npagesets = `0`;
1545	}
1546
1547	/*
1548	* This function is called after one segment has been successfully sent to
1549	* release the flow and TID HW/SW resources for that segment. The segments for a
1550	* TID RDMA request are setup and cleared in FIFO order which is managed using a
1551	* circular buffer.
1552	*/
1553	int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
1554	__must_hold(&req->qp->s_lock)
1555	{
1556	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
1557	struct hfi1_ctxtdata *rcd = req->rcd;
1558	unsigned long flags;
1559	int i;
1560	struct rvt_qp *fqp;
1561
1562	lockdep_assert_held(&req->qp->s_lock);
1563	/ Exit if we have nothing in the flow circular buffer /
1564	if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
1565	return -EINVAL;
1566
1567	spin_lock_irqsave(&rcd->exp_lock, flags);
1568
1569	for (i = `0`; i < flow->tnode_cnt; i++)
1570	kern_unprogram_rcv_group(flow, grp_num: i);
1571	/ To prevent double unprogramming /
1572	flow->tnode_cnt = `0`;
1573	/ get head before dropping lock /
1574	fqp = first_qp(rcd, queue: &rcd->rarr_queue);
1575	spin_unlock_irqrestore(lock: &rcd->exp_lock, flags);
1576
1577	dma_unmap_flow(flow);
1578
1579	hfi1_tid_rdma_reset_flow(flow);
1580	req->clear_tail = (req->clear_tail + `1`) & (MAX_FLOWS - `1`);
1581
1582	if (fqp == req->qp) {
1583	__trigger_tid_waiter(qp: fqp);
1584	rvt_put_qp(qp: fqp);
1585	} else {
1586	tid_rdma_schedule_tid_wakeup(qp: fqp);
1587	}
1588
1589	return `0`;
1590	}
1591
1592	/*
1593	* This function is called to release all the tid entries for
1594	* a request.
1595	*/
1596	void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
1597	__must_hold(&req->qp->s_lock)
1598	{
1599	/ Use memory barrier for proper ordering /
1600	while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
1601	if (hfi1_kern_exp_rcv_clear(req))
1602	break;
1603	}
1604	}
1605
1606	/**
1607	* hfi1_kern_exp_rcv_free_flows - free previously allocated flow information
1608	* @req: the tid rdma request to be cleaned
1609	*/
1610	static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
1611	{
1612	kfree(objp: req->flows);
1613	req->flows = NULL;
1614	}
1615
1616	/**
1617	* __trdma_clean_swqe - clean up for large sized QPs
1618	* @qp: the queue patch
1619	* @wqe: the send wqe
1620	*/
1621	void __trdma_clean_swqe(struct rvt_qp qp, struct* rvt_swqe *wqe)
1622	{
1623	struct hfi1_swqe_priv *p = wqe->priv;
1624
1625	hfi1_kern_exp_rcv_free_flows(req: &p->tid_req);
1626	}
1627
1628	/*
1629	* This can be called at QP create time or in the data path.
1630	*/
1631	static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
1632	gfp_t gfp)
1633	{
1634	struct tid_rdma_flow *flows;
1635	int i;
1636
1637	if (likely(req->flows))
1638	return `0`;
1639	flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), flags: gfp,
1640	node: req->rcd->numa_id);
1641	if (!flows)
1642	return -ENOMEM;
1643	/ mini init /
1644	for (i = `0`; i < MAX_FLOWS; i++) {
1645	flows[i].req = req;
1646	flows[i].npagesets = `0`;
1647	flows[i].pagesets[`0`].mapped = `0`;
1648	flows[i].resync_npkts = `0`;
1649	}
1650	req->flows = flows;
1651	return `0`;
1652	}
1653
1654	static void hfi1_init_trdma_req(struct rvt_qp *qp,
1655	struct tid_rdma_request *req)
1656	{
1657	struct hfi1_qp_priv *qpriv = qp->priv;
1658
1659	/*
1660	* Initialize various TID RDMA request variables.
1661	* These variables are "static", which is why they
1662	* can be pre-initialized here before the WRs has
1663	* even been submitted.
1664	* However, non-NULL values for these variables do not
1665	* imply that this WQE has been enabled for TID RDMA.
1666	* Drivers should check the WQE's opcode to determine
1667	* if a request is a TID RDMA one or not.
1668	*/
1669	req->qp = qp;
1670	req->rcd = qpriv->rcd;
1671	}
1672
1673	u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
1674	void context, int* vl, int mode, u64 data)
1675	{
1676	struct hfi1_devdata *dd = context;
1677
1678	return dd->verbs_dev.n_tidwait;
1679	}
1680
1681	static struct tid_rdma_flow find_flow_ib(struct* tid_rdma_request *req,
1682	u32 psn, u16 *fidx)
1683	{
1684	u16 head, tail;
1685	struct tid_rdma_flow *flow;
1686
1687	head = req->setup_head;
1688	tail = req->clear_tail;
1689	for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
1690	tail = CIRC_NEXT(tail, MAX_FLOWS)) {
1691	flow = &req->flows[tail];
1692	if (cmp_psn(a: psn, b: flow->flow_state.ib_spsn) >= `0` &&
1693	cmp_psn(a: psn, b: flow->flow_state.ib_lpsn) <= `0`) {
1694	if (fidx)
1695	*fidx = tail;
1696	return flow;
1697	}
1698	}
1699	return NULL;
1700	}
1701
1702	/ TID RDMA READ functions /
1703	u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
1704	struct ib_other_headers ohdr, u32 bth1,
1705	u32 bth2, u32 len)
1706	{
1707	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1708	struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
1709	struct rvt_qp *qp = req->qp;
1710	struct hfi1_qp_priv *qpriv = qp->priv;
1711	struct hfi1_swqe_priv *wpriv = wqe->priv;
1712	struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
1713	struct tid_rdma_params *remote;
1714	u32 req_len = `0`;
1715	void *req_addr = NULL;
1716
1717	/ This is the IB psn used to send the request /
1718	*bth2 = mask_psn(a: flow->flow_state.ib_spsn + flow->pkt);
1719	trace_hfi1_tid_flow_build_read_pkt(qp, index: req->flow_idx, flow);
1720
1721	/ TID Entries for TID RDMA READ payload /
1722	req_addr = &flow->tid_entry[flow->tid_idx];
1723	req_len = sizeof(flow->tid_entry)
1724	(flow->tidcnt - flow->tid_idx);
1725
1726	memset(&ohdr->u.tid_rdma.r_req, `0`, sizeof(ohdr->u.tid_rdma.r_req));
1727	wpriv->ss.sge.vaddr = req_addr;
1728	wpriv->ss.sge.sge_length = req_len;
1729	wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
1730	/*
1731	* We can safely zero these out. Since the first SGE covers the
1732	* entire packet, nothing else should even look at the MR.
1733	*/
1734	wpriv->ss.sge.mr = NULL;
1735	wpriv->ss.sge.m = `0`;
1736	wpriv->ss.sge.n = `0`;
1737
1738	wpriv->ss.sg_list = NULL;
1739	wpriv->ss.total_len = wpriv->ss.sge.sge_length;
1740	wpriv->ss.num_sge = `1`;
1741
1742	/ Construct the TID RDMA READ REQ packet header /
1743	rcu_read_lock();
1744	remote = rcu_dereference(qpriv->tid_rdma.remote);
1745
1746	KDETH_RESET(rreq->kdeth0, KVER, `0x1`);
1747	KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
1748	rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
1749	req->cur_seg * req->seg_len + flow->sent);
1750	rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
1751	rreq->reth.length = cpu_to_be32(*len);
1752	rreq->tid_flow_psn =
1753	cpu_to_be32((flow->flow_state.generation <<
1754	HFI1_KDETH_BTH_SEQ_SHIFT) \|
1755	((flow->flow_state.spsn + flow->pkt) &
1756	HFI1_KDETH_BTH_SEQ_MASK));
1757	rreq->tid_flow_qp =
1758	cpu_to_be32(qpriv->tid_rdma.local.qp \|
1759	((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
1760	TID_RDMA_DESTQP_FLOW_SHIFT) \|
1761	qpriv->rcd->ctxt);
1762	rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
1763	*bth1 &= ~RVT_QPN_MASK;
1764	*bth1 \|= remote->qp;
1765	*bth2 \|= IB_BTH_REQ_ACK;
1766	rcu_read_unlock();
1767
1768	/ We are done with this segment /
1769	flow->sent += *len;
1770	req->cur_seg++;
1771	qp->s_state = TID_OP(READ_REQ);
1772	req->ack_pending++;
1773	req->flow_idx = (req->flow_idx + `1`) & (MAX_FLOWS - `1`);
1774	qpriv->pending_tid_r_segs++;
1775	qp->s_num_rd_atomic++;
1776
1777	/ Set the TID RDMA READ request payload size /
1778	*len = req_len;
1779
1780	return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
1781	}
1782
1783	/*
1784	* @len: contains the data length to read upon entry and the read request
1785	* payload length upon exit.
1786	*/
1787	u32 hfi1_build_tid_rdma_read_req(struct rvt_qp qp, struct* rvt_swqe *wqe,
1788	struct ib_other_headers ohdr, u32 bth1,
1789	u32 bth2, u32 len)
1790	__must_hold(&qp->s_lock)
1791	{
1792	struct hfi1_qp_priv *qpriv = qp->priv;
1793	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1794	struct tid_rdma_flow *flow = NULL;
1795	u32 hdwords = `0`;
1796	bool last;
1797	bool retry = true;
1798	u32 npkts = rvt_div_round_up_mtu(qp, len: *len);
1799
1800	trace_hfi1_tid_req_build_read_req(qp, newreq: `0`, opcode: wqe->wr.opcode, psn: wqe->psn,
1801	lpsn: wqe->lpsn, req);
1802	/*
1803	* Check sync conditions. Make sure that there are no pending
1804	* segments before freeing the flow.
1805	*/
1806	sync_check:
1807	if (req->state == TID_REQUEST_SYNC) {
1808	if (qpriv->pending_tid_r_segs)
1809	goto done;
1810
1811	hfi1_kern_clear_hw_flow(rcd: req->rcd, qp);
1812	qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
1813	req->state = TID_REQUEST_ACTIVE;
1814	}
1815
1816	/*
1817	* If the request for this segment is resent, the tid resources should
1818	* have been allocated before. In this case, req->flow_idx should
1819	* fall behind req->setup_head.
1820	*/
1821	if (req->flow_idx == req->setup_head) {
1822	retry = false;
1823	if (req->state == TID_REQUEST_RESEND) {
1824	/*
1825	* This is the first new segment for a request whose
1826	* earlier segments have been re-sent. We need to
1827	* set up the sge pointer correctly.
1828	*/
1829	restart_sge(ss: &qp->s_sge, wqe, psn: req->s_next_psn,
1830	pmtu: qp->pmtu);
1831	req->isge = `0`;
1832	req->state = TID_REQUEST_ACTIVE;
1833	}
1834
1835	/*
1836	* Check sync. The last PSN of each generation is reserved for
1837	* RESYNC.
1838	*/
1839	if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - `1`) {
1840	req->state = TID_REQUEST_SYNC;
1841	goto sync_check;
1842	}
1843
1844	/ Allocate the flow if not yet /
1845	if (hfi1_kern_setup_hw_flow(rcd: qpriv->rcd, qp))
1846	goto done;
1847
1848	/*
1849	* The following call will advance req->setup_head after
1850	* allocating the tid entries.
1851	*/
1852	if (hfi1_kern_exp_rcv_setup(req, ss: &qp->s_sge, last: &last)) {
1853	req->state = TID_REQUEST_QUEUED;
1854
1855	/*
1856	* We don't have resources for this segment. The QP has
1857	* already been queued.
1858	*/
1859	goto done;
1860	}
1861	}
1862
1863	/ req->flow_idx should only be one slot behind req->setup_head /
1864	flow = &req->flows[req->flow_idx];
1865	flow->pkt = `0`;
1866	flow->tid_idx = `0`;
1867	flow->sent = `0`;
1868	if (!retry) {
1869	/ Set the first and last IB PSN for the flow in use./
1870	flow->flow_state.ib_spsn = req->s_next_psn;
1871	flow->flow_state.ib_lpsn =
1872	flow->flow_state.ib_spsn + flow->npkts - `1`;
1873	}
1874
1875	/ Calculate the next segment start psn./
1876	req->s_next_psn += flow->npkts;
1877
1878	/ Build the packet header /
1879	hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
1880	done:
1881	return hdwords;
1882	}
1883
1884	/*
1885	* Validate and accept the TID RDMA READ request parameters.
1886	* Return 0 if the request is accepted successfully;
1887	* Return 1 otherwise.
1888	*/
1889	static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
1890	struct rvt_ack_entry *e,
1891	struct hfi1_packet *packet,
1892	struct ib_other_headers *ohdr,
1893	u32 bth0, u32 psn, u64 vaddr, u32 len)
1894	{
1895	struct hfi1_qp_priv *qpriv = qp->priv;
1896	struct tid_rdma_request *req;
1897	struct tid_rdma_flow *flow;
1898	u32 flow_psn, i, tidlen = `0`, pktlen, tlen;
1899
1900	req = ack_to_tid_req(e);
1901
1902	/ Validate the payload first /
1903	flow = &req->flows[req->setup_head];
1904
1905	/ payload length = packet length - (header length + ICRC length) /
1906	pktlen = packet->tlen - (packet->hlen + `4`);
1907	if (pktlen > sizeof(flow->tid_entry))
1908	return `1`;
1909	memcpy(flow->tid_entry, packet->ebuf, pktlen);
1910	flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
1911
1912	/*
1913	* Walk the TID_ENTRY list to make sure we have enough space for a
1914	* complete segment. Also calculate the number of required packets.
1915	*/
1916	flow->npkts = rvt_div_round_up_mtu(qp, len);
1917	for (i = `0`; i < flow->tidcnt; i++) {
1918	trace_hfi1_tid_entry_rcv_read_req(qp, index: i,
1919	ent: flow->tid_entry[i]);
1920	tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
1921	if (!tlen)
1922	return `1`;
1923
1924	/*
1925	* For tid pair (tidctr == 3), the buffer size of the pair
1926	* should be the sum of the buffer size described by each
1927	* tid entry. However, only the first entry needs to be
1928	* specified in the request (see WFR HAS Section 8.5.7.1).
1929	*/
1930	tidlen += tlen;
1931	}
1932	if (tidlen * PAGE_SIZE < len)
1933	return `1`;
1934
1935	/ Empty the flow array /
1936	req->clear_tail = req->setup_head;
1937	flow->pkt = `0`;
1938	flow->tid_idx = `0`;
1939	flow->tid_offset = `0`;
1940	flow->sent = `0`;
1941	flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
1942	flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
1943	TID_RDMA_DESTQP_FLOW_MASK;
1944	flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
1945	flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
1946	flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
1947	flow->length = len;
1948
1949	flow->flow_state.lpsn = flow->flow_state.spsn +
1950	flow->npkts - `1`;
1951	flow->flow_state.ib_spsn = psn;
1952	flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - `1`;
1953
1954	trace_hfi1_tid_flow_rcv_read_req(qp, index: req->setup_head, flow);
1955	/ Set the initial flow index to the current flow. /
1956	req->flow_idx = req->setup_head;
1957
1958	/ advance circular buffer head /
1959	req->setup_head = (req->setup_head + `1`) & (MAX_FLOWS - `1`);
1960
1961	/*
1962	* Compute last PSN for request.
1963	*/
1964	e->opcode = (bth0 >> `24`) & `0xff`;
1965	e->psn = psn;
1966	e->lpsn = psn + flow->npkts - `1`;
1967	e->sent = `0`;
1968
1969	req->n_flows = qpriv->tid_rdma.local.max_read;
1970	req->state = TID_REQUEST_ACTIVE;
1971	req->cur_seg = `0`;
1972	req->comp_seg = `0`;
1973	req->ack_seg = `0`;
1974	req->isge = `0`;
1975	req->seg_len = qpriv->tid_rdma.local.max_len;
1976	req->total_len = len;
1977	req->total_segs = `1`;
1978	req->r_flow_psn = e->psn;
1979
1980	trace_hfi1_tid_req_rcv_read_req(qp, newreq: `0`, opcode: e->opcode, psn: e->psn, lpsn: e->lpsn,
1981	req);
1982	return `0`;
1983	}
1984
1985	static int tid_rdma_rcv_error(struct hfi1_packet *packet,
1986	struct ib_other_headers *ohdr,
1987	struct rvt_qp qp, u32 psn, int* diff)
1988	{
1989	struct hfi1_ibport *ibp = to_iport(ibdev: qp->ibqp.device, port: qp->port_num);
1990	struct hfi1_ctxtdata rcd = ((struct* hfi1_qp_priv *)qp->priv)->rcd;
1991	struct hfi1_ibdev *dev = to_idev(ibdev: qp->ibqp.device);
1992	struct hfi1_qp_priv *qpriv = qp->priv;
1993	struct rvt_ack_entry *e;
1994	struct tid_rdma_request *req;
1995	unsigned long flags;
1996	u8 prev;
1997	bool old_req;
1998
1999	trace_hfi1_rsp_tid_rcv_error(qp, psn);
2000	trace_hfi1_tid_rdma_rcv_err(qp, opcode: `0`, psn, diff);
2001	if (diff > `0`) {
2002	/ sequence error /
2003	if (!qp->r_nak_state) {
2004	ibp->rvp.n_rc_seqnak++;
2005	qp->r_nak_state = IB_NAK_PSN_ERROR;
2006	qp->r_ack_psn = qp->r_psn;
2007	rc_defered_ack(rcd, qp);
2008	}
2009	goto done;
2010	}
2011
2012	ibp->rvp.n_rc_dupreq++;
2013
2014	spin_lock_irqsave(&qp->s_lock, flags);
2015	e = find_prev_entry(qp, psn, prev: &prev, NULL, scheduled: &old_req);
2016	if (!e \|\| (e->opcode != TID_OP(READ_REQ) &&
2017	e->opcode != TID_OP(WRITE_REQ)))
2018	goto unlock;
2019
2020	req = ack_to_tid_req(e);
2021	req->r_flow_psn = psn;
2022	trace_hfi1_tid_req_rcv_err(qp, newreq: `0`, opcode: e->opcode, psn: e->psn, lpsn: e->lpsn, req);
2023	if (e->opcode == TID_OP(READ_REQ)) {
2024	struct ib_reth *reth;
2025	u32 len;
2026	u32 rkey;
2027	u64 vaddr;
2028	int ok;
2029	u32 bth0;
2030
2031	reth = &ohdr->u.tid_rdma.r_req.reth;
2032	/*
2033	* The requester always restarts from the start of the original
2034	* request.
2035	*/
2036	len = be32_to_cpu(reth->length);
2037	if (psn != e->psn \|\| len != req->total_len)
2038	goto unlock;
2039
2040	release_rdma_sge_mr(e);
2041
2042	rkey = be32_to_cpu(reth->rkey);
2043	vaddr = get_ib_reth_vaddr(reth);
2044
2045	qp->r_len = len;
2046	ok = rvt_rkey_ok(qp, sge: &e->rdma_sge, len, vaddr, rkey,
2047	acc: IB_ACCESS_REMOTE_READ);
2048	if (unlikely(!ok))
2049	goto unlock;
2050
2051	/*
2052	* If all the response packets for the current request have
2053	* been sent out and this request is complete (old_request
2054	* == false) and the TID flow may be unusable (the
2055	* req->clear_tail is advanced). However, when an earlier
2056	* request is received, this request will not be complete any
2057	* more (qp->s_tail_ack_queue is moved back, see below).
2058	* Consequently, we need to update the TID flow info every time
2059	* a duplicate request is received.
2060	*/
2061	bth0 = be32_to_cpu(ohdr->bth[`0`]);
2062	if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
2063	vaddr, len))
2064	goto unlock;
2065
2066	/*
2067	* True if the request is already scheduled (between
2068	* qp->s_tail_ack_queue and qp->r_head_ack_queue);
2069	*/
2070	if (old_req)
2071	goto unlock;
2072	} else {
2073	struct flow_state *fstate;
2074	bool schedule = false;
2075	u8 i;
2076
2077	if (req->state == TID_REQUEST_RESEND) {
2078	req->state = TID_REQUEST_RESEND_ACTIVE;
2079	} else if (req->state == TID_REQUEST_INIT_RESEND) {
2080	req->state = TID_REQUEST_INIT;
2081	schedule = true;
2082	}
2083
2084	/*
2085	* True if the request is already scheduled (between
2086	* qp->s_tail_ack_queue and qp->r_head_ack_queue).
2087	* Also, don't change requests, which are at the SYNC
2088	* point and haven't generated any responses yet.
2089	* There is nothing to retransmit for them yet.
2090	*/
2091	if (old_req \|\| req->state == TID_REQUEST_INIT \|\|
2092	(req->state == TID_REQUEST_SYNC && !req->cur_seg)) {
2093	for (i = prev + `1`; ; i++) {
2094	if (i > rvt_size_atomic(rdi: &dev->rdi))
2095	i = `0`;
2096	if (i == qp->r_head_ack_queue)
2097	break;
2098	e = &qp->s_ack_queue[i];
2099	req = ack_to_tid_req(e);
2100	if (e->opcode == TID_OP(WRITE_REQ) &&
2101	req->state == TID_REQUEST_INIT)
2102	req->state = TID_REQUEST_INIT_RESEND;
2103	}
2104	/*
2105	* If the state of the request has been changed,
2106	* the first leg needs to get scheduled in order to
2107	* pick up the change. Otherwise, normal response
2108	* processing should take care of it.
2109	*/
2110	if (!schedule)
2111	goto unlock;
2112	}
2113
2114	/*
2115	* If there is no more allocated segment, just schedule the qp
2116	* without changing any state.
2117	*/
2118	if (req->clear_tail == req->setup_head)
2119	goto schedule;
2120	/*
2121	* If this request has sent responses for segments, which have
2122	* not received data yet (flow_idx != clear_tail), the flow_idx
2123	* pointer needs to be adjusted so the same responses can be
2124	* re-sent.
2125	*/
2126	if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) {
2127	fstate = &req->flows[req->clear_tail].flow_state;
2128	qpriv->pending_tid_w_segs -=
2129	CIRC_CNT(req->flow_idx, req->clear_tail,
2130	MAX_FLOWS);
2131	req->flow_idx =
2132	CIRC_ADD(req->clear_tail,
2133	delta_psn(psn, fstate->resp_ib_psn),
2134	MAX_FLOWS);
2135	qpriv->pending_tid_w_segs +=
2136	delta_psn(a: psn, b: fstate->resp_ib_psn);
2137	/*
2138	* When flow_idx == setup_head, we've gotten a duplicate
2139	* request for a segment, which has not been allocated
2140	* yet. In that case, don't adjust this request.
2141	* However, we still want to go through the loop below
2142	* to adjust all subsequent requests.
2143	*/
2144	if (CIRC_CNT(req->setup_head, req->flow_idx,
2145	MAX_FLOWS)) {
2146	req->cur_seg = delta_psn(a: psn, b: e->psn);
2147	req->state = TID_REQUEST_RESEND_ACTIVE;
2148	}
2149	}
2150
2151	for (i = prev + `1`; ; i++) {
2152	/*
2153	* Look at everything up to and including
2154	* s_tail_ack_queue
2155	*/
2156	if (i > rvt_size_atomic(rdi: &dev->rdi))
2157	i = `0`;
2158	if (i == qp->r_head_ack_queue)
2159	break;
2160	e = &qp->s_ack_queue[i];
2161	req = ack_to_tid_req(e);
2162	trace_hfi1_tid_req_rcv_err(qp, newreq: `0`, opcode: e->opcode, psn: e->psn,
2163	lpsn: e->lpsn, req);
2164	if (e->opcode != TID_OP(WRITE_REQ) \|\|
2165	req->cur_seg == req->comp_seg \|\|
2166	req->state == TID_REQUEST_INIT \|\|
2167	req->state == TID_REQUEST_INIT_RESEND) {
2168	if (req->state == TID_REQUEST_INIT)
2169	req->state = TID_REQUEST_INIT_RESEND;
2170	continue;
2171	}
2172	qpriv->pending_tid_w_segs -=
2173	CIRC_CNT(req->flow_idx,
2174	req->clear_tail,
2175	MAX_FLOWS);
2176	req->flow_idx = req->clear_tail;
2177	req->state = TID_REQUEST_RESEND;
2178	req->cur_seg = req->comp_seg;
2179	}
2180	qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
2181	}
2182	/ Re-process old requests./
2183	if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
2184	qp->s_acked_ack_queue = prev;
2185	qp->s_tail_ack_queue = prev;
2186	/*
2187	* Since the qp->s_tail_ack_queue is modified, the
2188	* qp->s_ack_state must be changed to re-initialize
2189	* qp->s_ack_rdma_sge; Otherwise, we will end up in
2190	* wrong memory region.
2191	*/
2192	qp->s_ack_state = OP(ACKNOWLEDGE);
2193	schedule:
2194	/*
2195	* It's possible to receive a retry psn that is earlier than an RNRNAK
2196	* psn. In this case, the rnrnak state should be cleared.
2197	*/
2198	if (qpriv->rnr_nak_state) {
2199	qp->s_nak_state = `0`;
2200	qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
2201	qp->r_psn = e->lpsn + `1`;
2202	hfi1_tid_write_alloc_resources(qp, intr_ctx: true);
2203	}
2204
2205	qp->r_state = e->opcode;
2206	qp->r_nak_state = `0`;
2207	qp->s_flags \|= RVT_S_RESP_PENDING;
2208	hfi1_schedule_send(qp);
2209	unlock:
2210	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
2211	done:
2212	return `1`;
2213	}
2214
2215	void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
2216	{
2217	/ HANDLER FOR TID RDMA READ REQUEST packet (Responder side)/
2218
2219	/*
2220	* 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
2221	* (see hfi1_rc_rcv())
2222	* 2. Put TID RDMA READ REQ into the response queue (s_ack_queue)
2223	* - Setup struct tid_rdma_req with request info
2224	* - Initialize struct tid_rdma_flow info;
2225	* - Copy TID entries;
2226	* 3. Set the qp->s_ack_state.
2227	* 4. Set RVT_S_RESP_PENDING in s_flags.
2228	* 5. Kick the send engine (hfi1_schedule_send())
2229	*/
2230	struct hfi1_ctxtdata *rcd = packet->rcd;
2231	struct rvt_qp *qp = packet->qp;
2232	struct hfi1_ibport *ibp = to_iport(ibdev: qp->ibqp.device, port: qp->port_num);
2233	struct ib_other_headers *ohdr = packet->ohdr;
2234	struct rvt_ack_entry *e;
2235	unsigned long flags;
2236	struct ib_reth *reth;
2237	struct hfi1_qp_priv *qpriv = qp->priv;
2238	u32 bth0, psn, len, rkey;
2239	bool fecn;
2240	u8 next;
2241	u64 vaddr;
2242	int diff;
2243	u8 nack_state = IB_NAK_INVALID_REQUEST;
2244
2245	bth0 = be32_to_cpu(ohdr->bth[`0`]);
2246	if (hfi1_ruc_check_hdr(ibp, packet))
2247	return;
2248
2249	fecn = process_ecn(qp, pkt: packet);
2250	psn = mask_psn(be32_to_cpu(ohdr->bth[`2`]));
2251	trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
2252
2253	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
2254	rvt_comm_est(qp);
2255
2256	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2257	goto nack_inv;
2258
2259	reth = &ohdr->u.tid_rdma.r_req.reth;
2260	vaddr = be64_to_cpu(reth->vaddr);
2261	len = be32_to_cpu(reth->length);
2262	/ The length needs to be in multiples of PAGE_SIZE /
2263	if (!len \|\| len & ~PAGE_MASK \|\| len > qpriv->tid_rdma.local.max_len)
2264	goto nack_inv;
2265
2266	diff = delta_psn(a: psn, b: qp->r_psn);
2267	if (unlikely(diff)) {
2268	tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
2269	return;
2270	}
2271
2272	/ We've verified the request, insert it into the ack queue. /
2273	next = qp->r_head_ack_queue + `1`;
2274	if (next > rvt_size_atomic(rdi: ib_to_rvt(ibdev: qp->ibqp.device)))
2275	next = `0`;
2276	spin_lock_irqsave(&qp->s_lock, flags);
2277	if (unlikely(next == qp->s_tail_ack_queue)) {
2278	if (!qp->s_ack_queue[next].sent) {
2279	nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2280	goto nack_inv_unlock;
2281	}
2282	update_ack_queue(qp, n: next);
2283	}
2284	e = &qp->s_ack_queue[qp->r_head_ack_queue];
2285	release_rdma_sge_mr(e);
2286
2287	rkey = be32_to_cpu(reth->rkey);
2288	qp->r_len = len;
2289
2290	if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
2291	rkey, IB_ACCESS_REMOTE_READ)))
2292	goto nack_acc;
2293
2294	/ Accept the request parameters /
2295	if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
2296	len))
2297	goto nack_inv_unlock;
2298
2299	qp->r_state = e->opcode;
2300	qp->r_nak_state = `0`;
2301	/*
2302	* We need to increment the MSN here instead of when we
2303	* finish sending the result since a duplicate request would
2304	* increment it more than once.
2305	*/
2306	qp->r_msn++;
2307	qp->r_psn += e->lpsn - e->psn + `1`;
2308
2309	qp->r_head_ack_queue = next;
2310
2311	/*
2312	* For all requests other than TID WRITE which are added to the ack
2313	* queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to
2314	* do this because of interlocks between these and TID WRITE
2315	* requests. The same change has also been made in hfi1_rc_rcv().
2316	*/
2317	qpriv->r_tid_alloc = qp->r_head_ack_queue;
2318
2319	/ Schedule the send tasklet. /
2320	qp->s_flags \|= RVT_S_RESP_PENDING;
2321	if (fecn)
2322	qp->s_flags \|= RVT_S_ECN;
2323	hfi1_schedule_send(qp);
2324
2325	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
2326	return;
2327
2328	nack_inv_unlock:
2329	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
2330	nack_inv:
2331	rvt_rc_error(qp, err: IB_WC_LOC_QP_OP_ERR);
2332	qp->r_nak_state = nack_state;
2333	qp->r_ack_psn = qp->r_psn;
2334	/ Queue NAK for later /
2335	rc_defered_ack(rcd, qp);
2336	return;
2337	nack_acc:
2338	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
2339	rvt_rc_error(qp, err: IB_WC_LOC_PROT_ERR);
2340	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2341	qp->r_ack_psn = qp->r_psn;
2342	}
2343
2344	u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp qp, struct* rvt_ack_entry *e,
2345	struct ib_other_headers ohdr, u32 bth0,
2346	u32 bth1, u32 bth2, u32 len, bool last)
2347	{
2348	struct hfi1_ack_priv *epriv = e->priv;
2349	struct tid_rdma_request *req = &epriv->tid_req;
2350	struct hfi1_qp_priv *qpriv = qp->priv;
2351	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
2352	u32 tidentry = flow->tid_entry[flow->tid_idx];
2353	u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
2354	struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
2355	u32 next_offset, om = KDETH_OM_LARGE;
2356	bool last_pkt;
2357	u32 hdwords = `0`;
2358	struct tid_rdma_params *remote;
2359
2360	*len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
2361	flow->sent += *len;
2362	next_offset = flow->tid_offset + *len;
2363	last_pkt = (flow->sent >= flow->length);
2364
2365	trace_hfi1_tid_entry_build_read_resp(qp, index: flow->tid_idx, ent: tidentry);
2366	trace_hfi1_tid_flow_build_read_resp(qp, index: req->clear_tail, flow);
2367
2368	rcu_read_lock();
2369	remote = rcu_dereference(qpriv->tid_rdma.remote);
2370	if (!remote) {
2371	rcu_read_unlock();
2372	goto done;
2373	}
2374	KDETH_RESET(resp->kdeth0, KVER, `0x1`);
2375	KDETH_SET(resp->kdeth0, SH, !last_pkt);
2376	KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
2377	KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
2378	KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
2379	KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
2380	KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
2381	KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
2382	resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
2383	rcu_read_unlock();
2384
2385	resp->aeth = rvt_compute_aeth(qp);
2386	resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
2387	flow->pkt));
2388
2389	*bth0 = TID_OP(READ_RESP) << `24`;
2390	*bth1 = flow->tid_qpn;
2391	*bth2 = mask_psn(a: ((flow->flow_state.spsn + flow->pkt++) &
2392	HFI1_KDETH_BTH_SEQ_MASK) \|
2393	(flow->flow_state.generation <<
2394	HFI1_KDETH_BTH_SEQ_SHIFT));
2395	*last = last_pkt;
2396	if (last_pkt)
2397	/ Advance to next flow /
2398	req->clear_tail = (req->clear_tail + `1`) &
2399	(MAX_FLOWS - `1`);
2400
2401	if (next_offset >= tidlen) {
2402	flow->tid_offset = `0`;
2403	flow->tid_idx++;
2404	} else {
2405	flow->tid_offset = next_offset;
2406	}
2407
2408	hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
2409
2410	done:
2411	return hdwords;
2412	}
2413
2414	static inline struct tid_rdma_request *
2415	find_tid_request(struct rvt_qp qp, u32 psn, enum* ib_wr_opcode opcode)
2416	__must_hold(&qp->s_lock)
2417	{
2418	struct rvt_swqe *wqe;
2419	struct tid_rdma_request *req = NULL;
2420	u32 i, end;
2421
2422	end = qp->s_cur + `1`;
2423	if (end == qp->s_size)
2424	end = `0`;
2425	for (i = qp->s_acked; i != end;) {
2426	wqe = rvt_get_swqe_ptr(qp, n: i);
2427	if (cmp_psn(a: psn, b: wqe->psn) >= `0` &&
2428	cmp_psn(a: psn, b: wqe->lpsn) <= `0`) {
2429	if (wqe->wr.opcode == opcode)
2430	req = wqe_to_tid_req(wqe);
2431	break;
2432	}
2433	if (++i == qp->s_size)
2434	i = `0`;
2435	}
2436
2437	return req;
2438	}
2439
2440	void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
2441	{
2442	/ HANDLER FOR TID RDMA READ RESPONSE packet (Requester side) /
2443
2444	/*
2445	* 1. Find matching SWQE
2446	* 2. Check that the entire segment has been read.
2447	* 3. Remove HFI1_S_WAIT_TID_RESP from s_flags.
2448	* 4. Free the TID flow resources.
2449	* 5. Kick the send engine (hfi1_schedule_send())
2450	*/
2451	struct ib_other_headers *ohdr = packet->ohdr;
2452	struct rvt_qp *qp = packet->qp;
2453	struct hfi1_qp_priv *priv = qp->priv;
2454	struct hfi1_ctxtdata *rcd = packet->rcd;
2455	struct tid_rdma_request *req;
2456	struct tid_rdma_flow *flow;
2457	u32 opcode, aeth;
2458	bool fecn;
2459	unsigned long flags;
2460	u32 kpsn, ipsn;
2461
2462	trace_hfi1_sender_rcv_tid_read_resp(qp);
2463	fecn = process_ecn(qp, pkt: packet);
2464	kpsn = mask_psn(be32_to_cpu(ohdr->bth[`2`]));
2465	aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
2466	opcode = (be32_to_cpu(ohdr->bth[`0`]) >> `24`) & `0xff`;
2467
2468	spin_lock_irqsave(&qp->s_lock, flags);
2469	ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
2470	req = find_tid_request(qp, psn: ipsn, IB_WR_TID_RDMA_READ);
2471	if (unlikely(!req))
2472	goto ack_op_err;
2473
2474	flow = &req->flows[req->clear_tail];
2475	/ When header suppression is disabled /
2476	if (cmp_psn(a: ipsn, b: flow->flow_state.ib_lpsn)) {
2477	update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
2478
2479	if (cmp_psn(a: kpsn, b: flow->flow_state.r_next_psn))
2480	goto ack_done;
2481	flow->flow_state.r_next_psn = mask_psn(a: kpsn + `1`);
2482	/*
2483	* Copy the payload to destination buffer if this packet is
2484	* delivered as an eager packet due to RSM rule and FECN.
2485	* The RSM rule selects FECN bit in BTH and SH bit in
2486	* KDETH header and therefore will not match the last
2487	* packet of each segment that has SH bit cleared.
2488	*/
2489	if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
2490	struct rvt_sge_state ss;
2491	u32 len;
2492	u32 tlen = packet->tlen;
2493	u16 hdrsize = packet->hlen;
2494	u8 pad = packet->pad;
2495	u8 extra_bytes = pad + packet->extra_byte +
2496	(SIZE_OF_CRC << `2`);
2497	u32 pmtu = qp->pmtu;
2498
2499	if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
2500	goto ack_op_err;
2501	len = restart_sge(ss: &ss, wqe: req->e.swqe, psn: ipsn, pmtu);
2502	if (unlikely(len < pmtu))
2503	goto ack_op_err;
2504	rvt_copy_sge(qp, ss: &ss, data: packet->payload, length: pmtu, release: false,
2505	copy_last: false);
2506	/ Raise the sw sequence check flag for next packet /
2507	priv->s_flags \|= HFI1_R_TID_SW_PSN;
2508	}
2509
2510	goto ack_done;
2511	}
2512	flow->flow_state.r_next_psn = mask_psn(a: kpsn + `1`);
2513	req->ack_pending--;
2514	priv->pending_tid_r_segs--;
2515	qp->s_num_rd_atomic--;
2516	if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
2517	!qp->s_num_rd_atomic) {
2518	qp->s_flags &= ~(RVT_S_WAIT_FENCE \|
2519	RVT_S_WAIT_ACK);
2520	hfi1_schedule_send(qp);
2521	}
2522	if (qp->s_flags & RVT_S_WAIT_RDMAR) {
2523	qp->s_flags &= ~(RVT_S_WAIT_RDMAR \| RVT_S_WAIT_ACK);
2524	hfi1_schedule_send(qp);
2525	}
2526
2527	trace_hfi1_ack(qp, psn: ipsn);
2528	trace_hfi1_tid_req_rcv_read_resp(qp, newreq: `0`, opcode: req->e.swqe->wr.opcode,
2529	psn: req->e.swqe->psn, lpsn: req->e.swqe->lpsn,
2530	req);
2531	trace_hfi1_tid_flow_rcv_read_resp(qp, index: req->clear_tail, flow);
2532
2533	/ Release the tid resources /
2534	hfi1_kern_exp_rcv_clear(req);
2535
2536	if (!do_rc_ack(qp, aeth, psn: ipsn, opcode, val: `0`, rcd))
2537	goto ack_done;
2538
2539	/ If not done yet, build next read request /
2540	if (++req->comp_seg >= req->total_segs) {
2541	priv->tid_r_comp++;
2542	req->state = TID_REQUEST_COMPLETE;
2543	}
2544
2545	/*
2546	* Clear the hw flow under two conditions:
2547	* 1. This request is a sync point and it is complete;
2548	* 2. Current request is completed and there are no more requests.
2549	*/
2550	if ((req->state == TID_REQUEST_SYNC &&
2551	req->comp_seg == req->cur_seg) \|\|
2552	priv->tid_r_comp == priv->tid_r_reqs) {
2553	hfi1_kern_clear_hw_flow(rcd: priv->rcd, qp);
2554	priv->s_flags &= ~HFI1_R_TID_SW_PSN;
2555	if (req->state == TID_REQUEST_SYNC)
2556	req->state = TID_REQUEST_ACTIVE;
2557	}
2558
2559	hfi1_schedule_send(qp);
2560	goto ack_done;
2561
2562	ack_op_err:
2563	/*
2564	* The test indicates that the send engine has finished its cleanup
2565	* after sending the request and it's now safe to put the QP into error
2566	* state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
2567	* == qp->s_head), it would be unsafe to complete the wqe pointed by
2568	* qp->s_acked here. Putting the qp into error state will safely flush
2569	* all remaining requests.
2570	*/
2571	if (qp->s_last == qp->s_acked)
2572	rvt_error_qp(qp, err: IB_WC_WR_FLUSH_ERR);
2573
2574	ack_done:
2575	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
2576	}
2577
2578	void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
2579	__must_hold(&qp->s_lock)
2580	{
2581	u32 n = qp->s_acked;
2582	struct rvt_swqe *wqe;
2583	struct tid_rdma_request *req;
2584	struct hfi1_qp_priv *priv = qp->priv;
2585
2586	lockdep_assert_held(&qp->s_lock);
2587	/ Free any TID entries /
2588	while (n != qp->s_tail) {
2589	wqe = rvt_get_swqe_ptr(qp, n);
2590	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2591	req = wqe_to_tid_req(wqe);
2592	hfi1_kern_exp_rcv_clear_all(req);
2593	}
2594
2595	if (++n == qp->s_size)
2596	n = `0`;
2597	}
2598	/ Free flow /
2599	hfi1_kern_clear_hw_flow(rcd: priv->rcd, qp);
2600	}
2601
2602	static bool tid_rdma_tid_err(struct hfi1_packet *packet, u8 rcv_type)
2603	{
2604	struct rvt_qp *qp = packet->qp;
2605
2606	if (rcv_type >= RHF_RCV_TYPE_IB)
2607	goto done;
2608
2609	spin_lock(lock: &qp->s_lock);
2610
2611	/*
2612	* We've ran out of space in the eager buffer.
2613	* Eagerly received KDETH packets which require space in the
2614	* Eager buffer (packet that have payload) are TID RDMA WRITE
2615	* response packets. In this case, we have to re-transmit the
2616	* TID RDMA WRITE request.
2617	*/
2618	if (rcv_type == RHF_RCV_TYPE_EAGER) {
2619	hfi1_restart_rc(qp, psn: qp->s_last_psn + `1`, wait: `1`);
2620	hfi1_schedule_send(qp);
2621	}
2622
2623	/ Since no payload is delivered, just drop the packet /
2624	spin_unlock(lock: &qp->s_lock);
2625	done:
2626	return true;
2627	}
2628
2629	static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
2630	struct rvt_qp qp, struct* rvt_swqe *wqe)
2631	{
2632	struct tid_rdma_request *req;
2633	struct tid_rdma_flow *flow;
2634
2635	/ Start from the right segment /
2636	qp->r_flags \|= RVT_R_RDMAR_SEQ;
2637	req = wqe_to_tid_req(wqe);
2638	flow = &req->flows[req->clear_tail];
2639	hfi1_restart_rc(qp, psn: flow->flow_state.ib_spsn, wait: `0`);
2640	if (list_empty(head: &qp->rspwait)) {
2641	qp->r_flags \|= RVT_R_RSP_SEND;
2642	rvt_get_qp(qp);
2643	list_add_tail(new: &qp->rspwait, head: &rcd->qp_wait_list);
2644	}
2645	}
2646
2647	/*
2648	* Handle the KDETH eflags for TID RDMA READ response.
2649	*
2650	* Return true if the last packet for a segment has been received and it is
2651	* time to process the response normally; otherwise, return true.
2652	*
2653	* The caller must hold the packet->qp->r_lock and the rcu_read_lock.
2654	*/
2655	static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
2656	struct hfi1_packet *packet, u8 rcv_type,
2657	u8 rte, u32 psn, u32 ibpsn)
2658	__must_hold(&packet->qp->r_lock) __must_hold(RCU)
2659	{
2660	struct hfi1_pportdata *ppd = rcd->ppd;
2661	struct hfi1_devdata *dd = ppd->dd;
2662	struct hfi1_ibport *ibp;
2663	struct rvt_swqe *wqe;
2664	struct tid_rdma_request *req;
2665	struct tid_rdma_flow *flow;
2666	u32 ack_psn;
2667	struct rvt_qp *qp = packet->qp;
2668	struct hfi1_qp_priv *priv = qp->priv;
2669	bool ret = true;
2670	int diff = `0`;
2671	u32 fpsn;
2672
2673	lockdep_assert_held(&qp->r_lock);
2674	trace_hfi1_rsp_read_kdeth_eflags(qp, psn: ibpsn);
2675	trace_hfi1_sender_read_kdeth_eflags(qp);
2676	trace_hfi1_tid_read_sender_kdeth_eflags(qp, newreq: `0`);
2677	spin_lock(lock: &qp->s_lock);
2678	/ If the psn is out of valid range, drop the packet /
2679	if (cmp_psn(a: ibpsn, b: qp->s_last_psn) < `0` \|\|
2680	cmp_psn(a: ibpsn, b: qp->s_psn) > `0`)
2681	goto s_unlock;
2682
2683	/*
2684	* Note that NAKs implicitly ACK outstanding SEND and RDMA write
2685	* requests and implicitly NAK RDMA read and atomic requests issued
2686	* before the NAK'ed request.
2687	*/
2688	ack_psn = ibpsn - `1`;
2689	wqe = rvt_get_swqe_ptr(qp, n: qp->s_acked);
2690	ibp = to_iport(ibdev: qp->ibqp.device, port: qp->port_num);
2691
2692	/ Complete WQEs that the PSN finishes. /
2693	while ((int)delta_psn(a: ack_psn, b: wqe->lpsn) >= `0`) {
2694	/*
2695	* If this request is a RDMA read or atomic, and the NACK is
2696	* for a later operation, this NACK NAKs the RDMA read or
2697	* atomic.
2698	*/
2699	if (wqe->wr.opcode == IB_WR_RDMA_READ \|\|
2700	wqe->wr.opcode == IB_WR_TID_RDMA_READ \|\|
2701	wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP \|\|
2702	wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
2703	/ Retry this request. /
2704	if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
2705	qp->r_flags \|= RVT_R_RDMAR_SEQ;
2706	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2707	restart_tid_rdma_read_req(rcd, qp,
2708	wqe);
2709	} else {
2710	hfi1_restart_rc(qp, psn: qp->s_last_psn + `1`,
2711	wait: `0`);
2712	if (list_empty(head: &qp->rspwait)) {
2713	qp->r_flags \|= RVT_R_RSP_SEND;
2714	rvt_get_qp(qp);
2715	list_add_tail(/ wait /
2716	new: &qp->rspwait,
2717	head: &rcd->qp_wait_list);
2718	}
2719	}
2720	}
2721	/*
2722	* No need to process the NAK since we are
2723	* restarting an earlier request.
2724	*/
2725	break;
2726	}
2727
2728	wqe = do_rc_completion(qp, wqe, ibp);
2729	if (qp->s_acked == qp->s_tail)
2730	goto s_unlock;
2731	}
2732
2733	if (qp->s_acked == qp->s_tail)
2734	goto s_unlock;
2735
2736	/ Handle the eflags for the request /
2737	if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
2738	goto s_unlock;
2739
2740	req = wqe_to_tid_req(wqe);
2741	trace_hfi1_tid_req_read_kdeth_eflags(qp, newreq: `0`, opcode: wqe->wr.opcode, psn: wqe->psn,
2742	lpsn: wqe->lpsn, req);
2743	switch (rcv_type) {
2744	case RHF_RCV_TYPE_EXPECTED:
2745	switch (rte) {
2746	case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
2747	/*
2748	* On the first occurrence of a Flow Sequence error,
2749	* the flag TID_FLOW_SW_PSN is set.
2750	*
2751	* After that, the flow is not reprogrammed and the
2752	* protocol falls back to SW PSN checking. This is done
2753	* to prevent continuous Flow Sequence errors for any
2754	* packets that could be still in the fabric.
2755	*/
2756	flow = &req->flows[req->clear_tail];
2757	trace_hfi1_tid_flow_read_kdeth_eflags(qp,
2758	index: req->clear_tail,
2759	flow);
2760	if (priv->s_flags & HFI1_R_TID_SW_PSN) {
2761	diff = cmp_psn(a: psn,
2762	b: flow->flow_state.r_next_psn);
2763	if (diff > `0`) {
2764	/ Drop the packet./
2765	goto s_unlock;
2766	} else if (diff < `0`) {
2767	/*
2768	* If a response packet for a restarted
2769	* request has come back, reset the
2770	* restart flag.
2771	*/
2772	if (qp->r_flags & RVT_R_RDMAR_SEQ)
2773	qp->r_flags &=
2774	~RVT_R_RDMAR_SEQ;
2775
2776	/ Drop the packet./
2777	goto s_unlock;
2778	}
2779
2780	/*
2781	* If SW PSN verification is successful and
2782	* this is the last packet in the segment, tell
2783	* the caller to process it as a normal packet.
2784	*/
2785	fpsn = full_flow_psn(flow,
2786	psn: flow->flow_state.lpsn);
2787	if (cmp_psn(a: fpsn, b: psn) == `0`) {
2788	ret = false;
2789	if (qp->r_flags & RVT_R_RDMAR_SEQ)
2790	qp->r_flags &=
2791	~RVT_R_RDMAR_SEQ;
2792	}
2793	flow->flow_state.r_next_psn =
2794	mask_psn(a: psn + `1`);
2795	} else {
2796	u32 last_psn;
2797
2798	last_psn = read_r_next_psn(dd, ctxt: rcd->ctxt,
2799	fidx: flow->idx);
2800	flow->flow_state.r_next_psn = last_psn;
2801	priv->s_flags \|= HFI1_R_TID_SW_PSN;
2802	/*
2803	* If no request has been restarted yet,
2804	* restart the current one.
2805	*/
2806	if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
2807	restart_tid_rdma_read_req(rcd, qp,
2808	wqe);
2809	}
2810
2811	break;
2812
2813	case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
2814	/*
2815	* Since the TID flow is able to ride through
2816	* generation mismatch, drop this stale packet.
2817	*/
2818	break;
2819
2820	default:
2821	break;
2822	}
2823	break;
2824
2825	case RHF_RCV_TYPE_ERROR:
2826	switch (rte) {
2827	case RHF_RTE_ERROR_OP_CODE_ERR:
2828	case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
2829	case RHF_RTE_ERROR_KHDR_HCRC_ERR:
2830	case RHF_RTE_ERROR_KHDR_KVER_ERR:
2831	case RHF_RTE_ERROR_CONTEXT_ERR:
2832	case RHF_RTE_ERROR_KHDR_TID_ERR:
2833	default:
2834	break;
2835	}
2836	break;
2837	default:
2838	break;
2839	}
2840	s_unlock:
2841	spin_unlock(lock: &qp->s_lock);
2842	return ret;
2843	}
2844
2845	bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
2846	struct hfi1_pportdata *ppd,
2847	struct hfi1_packet *packet)
2848	{
2849	struct hfi1_ibport *ibp = &ppd->ibport_data;
2850	struct hfi1_devdata *dd = ppd->dd;
2851	struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
2852	u8 rcv_type = rhf_rcv_type(rhf: packet->rhf);
2853	u8 rte = rhf_rcv_type_err(rhf: packet->rhf);
2854	struct ib_header *hdr = packet->hdr;
2855	struct ib_other_headers *ohdr = NULL;
2856	int lnh = be16_to_cpu(hdr->lrh[`0`]) & `3`;
2857	u16 lid = be16_to_cpu(hdr->lrh[`1`]);
2858	u8 opcode;
2859	u32 qp_num, psn, ibpsn;
2860	struct rvt_qp *qp;
2861	struct hfi1_qp_priv *qpriv;
2862	unsigned long flags;
2863	bool ret = true;
2864	struct rvt_ack_entry *e;
2865	struct tid_rdma_request *req;
2866	struct tid_rdma_flow *flow;
2867	int diff = `0`;
2868
2869	trace_hfi1_msg_handle_kdeth_eflags(NULL, msg: "Kdeth error: rhf ",
2870	more: packet->rhf);
2871	if (packet->rhf & RHF_ICRC_ERR)
2872	return ret;
2873
2874	packet->ohdr = &hdr->u.oth;
2875	ohdr = packet->ohdr;
2876	trace_input_ibhdr(dd: rcd->dd, packet, sc5: !!(rhf_dc_info(rhf: packet->rhf)));
2877
2878	/ Get the destination QP number. /
2879	qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) &
2880	RVT_QPN_MASK;
2881	if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
2882	goto drop;
2883
2884	psn = mask_psn(be32_to_cpu(ohdr->bth[`2`]));
2885	opcode = (be32_to_cpu(ohdr->bth[`0`]) >> `24`) & `0xff`;
2886
2887	rcu_read_lock();
2888	qp = rvt_lookup_qpn(rdi, rvp: &ibp->rvp, qpn: qp_num);
2889	if (!qp)
2890	goto rcu_unlock;
2891
2892	packet->qp = qp;
2893
2894	/ Check for valid receive state. /
2895	spin_lock_irqsave(&qp->r_lock, flags);
2896	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
2897	ibp->rvp.n_pkt_drops++;
2898	goto r_unlock;
2899	}
2900
2901	if (packet->rhf & RHF_TID_ERR) {
2902	/ For TIDERR and RC QPs preemptively schedule a NAK /
2903	u32 tlen = rhf_pkt_len(rhf: packet->rhf); / in bytes /
2904
2905	/ Sanity check packet /
2906	if (tlen < `24`)
2907	goto r_unlock;
2908
2909	/*
2910	* Check for GRH. We should never get packets with GRH in this
2911	* path.
2912	*/
2913	if (lnh == HFI1_LRH_GRH)
2914	goto r_unlock;
2915
2916	if (tid_rdma_tid_err(packet, rcv_type))
2917	goto r_unlock;
2918	}
2919
2920	/ handle TID RDMA READ /
2921	if (opcode == TID_OP(READ_RESP)) {
2922	ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn);
2923	ibpsn = mask_psn(a: ibpsn);
2924	ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
2925	ibpsn);
2926	goto r_unlock;
2927	}
2928
2929	/*
2930	* qp->s_tail_ack_queue points to the rvt_ack_entry currently being
2931	* processed. These a completed sequentially so we can be sure that
2932	* the pointer will not change until the entire request has completed.
2933	*/
2934	spin_lock(lock: &qp->s_lock);
2935	qpriv = qp->priv;
2936	if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID \|\|
2937	qpriv->r_tid_tail == qpriv->r_tid_head)
2938	goto unlock;
2939	e = &qp->s_ack_queue[qpriv->r_tid_tail];
2940	if (e->opcode != TID_OP(WRITE_REQ))
2941	goto unlock;
2942	req = ack_to_tid_req(e);
2943	if (req->comp_seg == req->cur_seg)
2944	goto unlock;
2945	flow = &req->flows[req->clear_tail];
2946	trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn);
2947	trace_hfi1_rsp_handle_kdeth_eflags(qp, psn);
2948	trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp);
2949	trace_hfi1_tid_req_handle_kdeth_eflags(qp, newreq: `0`, opcode: e->opcode, psn: e->psn,
2950	lpsn: e->lpsn, req);
2951	trace_hfi1_tid_flow_handle_kdeth_eflags(qp, index: req->clear_tail, flow);
2952
2953	switch (rcv_type) {
2954	case RHF_RCV_TYPE_EXPECTED:
2955	switch (rte) {
2956	case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
2957	if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
2958	qpriv->s_flags \|= HFI1_R_TID_SW_PSN;
2959	flow->flow_state.r_next_psn =
2960	read_r_next_psn(dd, ctxt: rcd->ctxt,
2961	fidx: flow->idx);
2962	qpriv->r_next_psn_kdeth =
2963	flow->flow_state.r_next_psn;
2964	goto nak_psn;
2965	} else {
2966	/*
2967	* If the received PSN does not match the next
2968	* expected PSN, NAK the packet.
2969	* However, only do that if we know that a
2970	* NAK has already been sent. Otherwise, this
2971	* mismatch could be due to packets that were
2972	* already in flight.
2973	*/
2974	diff = cmp_psn(a: psn,
2975	b: flow->flow_state.r_next_psn);
2976	if (diff > `0`)
2977	goto nak_psn;
2978	else if (diff < `0`)
2979	break;
2980
2981	qpriv->s_nak_state = `0`;
2982	/*
2983	* If SW PSN verification is successful and this
2984	* is the last packet in the segment, tell the
2985	* caller to process it as a normal packet.
2986	*/
2987	if (psn == full_flow_psn(flow,
2988	psn: flow->flow_state.lpsn))
2989	ret = false;
2990	flow->flow_state.r_next_psn =
2991	mask_psn(a: psn + `1`);
2992	qpriv->r_next_psn_kdeth =
2993	flow->flow_state.r_next_psn;
2994	}
2995	break;
2996
2997	case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
2998	goto nak_psn;
2999
3000	default:
3001	break;
3002	}
3003	break;
3004
3005	case RHF_RCV_TYPE_ERROR:
3006	switch (rte) {
3007	case RHF_RTE_ERROR_OP_CODE_ERR:
3008	case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
3009	case RHF_RTE_ERROR_KHDR_HCRC_ERR:
3010	case RHF_RTE_ERROR_KHDR_KVER_ERR:
3011	case RHF_RTE_ERROR_CONTEXT_ERR:
3012	case RHF_RTE_ERROR_KHDR_TID_ERR:
3013	default:
3014	break;
3015	}
3016	break;
3017	default:
3018	break;
3019	}
3020
3021	unlock:
3022	spin_unlock(lock: &qp->s_lock);
3023	r_unlock:
3024	spin_unlock_irqrestore(lock: &qp->r_lock, flags);
3025	rcu_unlock:
3026	rcu_read_unlock();
3027	drop:
3028	return ret;
3029	nak_psn:
3030	ibp->rvp.n_rc_seqnak++;
3031	if (!qpriv->s_nak_state) {
3032	qpriv->s_nak_state = IB_NAK_PSN_ERROR;
3033	/ We are NAK'ing the next expected PSN /
3034	qpriv->s_nak_psn = mask_psn(a: flow->flow_state.r_next_psn);
3035	tid_rdma_trigger_ack(qp);
3036	}
3037	goto unlock;
3038	}
3039
3040	/*
3041	* "Rewind" the TID request information.
3042	* This means that we reset the state back to ACTIVE,
3043	* find the proper flow, set the flow index to that flow,
3044	* and reset the flow information.
3045	*/
3046	void hfi1_tid_rdma_restart_req(struct rvt_qp qp, struct* rvt_swqe *wqe,
3047	u32 *bth2)
3048	{
3049	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
3050	struct tid_rdma_flow *flow;
3051	struct hfi1_qp_priv *qpriv = qp->priv;
3052	int diff, delta_pkts;
3053	u32 tididx = `0`, i;
3054	u16 fidx;
3055
3056	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
3057	*bth2 = mask_psn(a: qp->s_psn);
3058	flow = find_flow_ib(req, psn: *bth2, fidx: &fidx);
3059	if (!flow) {
3060	trace_hfi1_msg_tid_restart_req(/ msg /
3061	qp, msg: "!!!!!! Could not find flow to restart: bth2 ",
3062	more: (u64)*bth2);
3063	trace_hfi1_tid_req_restart_req(qp, newreq: `0`, opcode: wqe->wr.opcode,
3064	psn: wqe->psn, lpsn: wqe->lpsn,
3065	req);
3066	return;
3067	}
3068	} else {
3069	fidx = req->acked_tail;
3070	flow = &req->flows[fidx];
3071	*bth2 = mask_psn(a: req->r_ack_psn);
3072	}
3073
3074	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
3075	delta_pkts = delta_psn(a: *bth2, b: flow->flow_state.ib_spsn);
3076	else
3077	delta_pkts = delta_psn(a: *bth2,
3078	b: full_flow_psn(flow,
3079	psn: flow->flow_state.spsn));
3080
3081	trace_hfi1_tid_flow_restart_req(qp, index: fidx, flow);
3082	diff = delta_pkts + flow->resync_npkts;
3083
3084	flow->sent = `0`;
3085	flow->pkt = `0`;
3086	flow->tid_idx = `0`;
3087	flow->tid_offset = `0`;
3088	if (diff) {
3089	for (tididx = `0`; tididx < flow->tidcnt; tididx++) {
3090	u32 tidentry = flow->tid_entry[tididx], tidlen,
3091	tidnpkts, npkts;
3092
3093	flow->tid_offset = `0`;
3094	tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE;
3095	tidnpkts = rvt_div_round_up_mtu(qp, len: tidlen);
3096	npkts = min_t(u32, diff, tidnpkts);
3097	flow->pkt += npkts;
3098	flow->sent += (npkts == tidnpkts ? tidlen :
3099	npkts * qp->pmtu);
3100	flow->tid_offset += npkts * qp->pmtu;
3101	diff -= npkts;
3102	if (!diff)
3103	break;
3104	}
3105	}
3106	if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
3107	rvt_skip_sge(ss: &qpriv->tid_ss, length: (req->cur_seg * req->seg_len) +
3108	flow->sent, release: `0`);
3109	/*
3110	* Packet PSN is based on flow_state.spsn + flow->pkt. However,
3111	* during a RESYNC, the generation is incremented and the
3112	* sequence is reset to 0. Since we've adjusted the npkts in the
3113	* flow and the SGE has been sufficiently advanced, we have to
3114	* adjust flow->pkt in order to calculate the correct PSN.
3115	*/
3116	flow->pkt -= flow->resync_npkts;
3117	}
3118
3119	if (flow->tid_offset ==
3120	EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
3121	tididx++;
3122	flow->tid_offset = `0`;
3123	}
3124	flow->tid_idx = tididx;
3125	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
3126	/ Move flow_idx to correct index /
3127	req->flow_idx = fidx;
3128	else
3129	req->clear_tail = fidx;
3130
3131	trace_hfi1_tid_flow_restart_req(qp, index: fidx, flow);
3132	trace_hfi1_tid_req_restart_req(qp, newreq: `0`, opcode: wqe->wr.opcode, psn: wqe->psn,
3133	lpsn: wqe->lpsn, req);
3134	req->state = TID_REQUEST_ACTIVE;
3135	if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
3136	/ Reset all the flows that we are going to resend /
3137	fidx = CIRC_NEXT(fidx, MAX_FLOWS);
3138	i = qpriv->s_tid_tail;
3139	do {
3140	for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS);
3141	fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
3142	req->flows[fidx].sent = `0`;
3143	req->flows[fidx].pkt = `0`;
3144	req->flows[fidx].tid_idx = `0`;
3145	req->flows[fidx].tid_offset = `0`;
3146	req->flows[fidx].resync_npkts = `0`;
3147	}
3148	if (i == qpriv->s_tid_cur)
3149	break;
3150	do {
3151	i = (++i == qp->s_size ? `0` : i);
3152	wqe = rvt_get_swqe_ptr(qp, n: i);
3153	} while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE);
3154	req = wqe_to_tid_req(wqe);
3155	req->cur_seg = req->ack_seg;
3156	fidx = req->acked_tail;
3157	/ Pull req->clear_tail back /
3158	req->clear_tail = fidx;
3159	} while (`1`);
3160	}
3161	}
3162
3163	void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
3164	{
3165	int i, ret;
3166	struct hfi1_qp_priv *qpriv = qp->priv;
3167	struct tid_flow_state *fs;
3168
3169	if (qp->ibqp.qp_type != IB_QPT_RC \|\| !HFI1_CAP_IS_KSET(TID_RDMA))
3170	return;
3171
3172	/*
3173	* First, clear the flow to help prevent any delayed packets from
3174	* being delivered.
3175	*/
3176	fs = &qpriv->flow_state;
3177	if (fs->index != RXE_NUM_TID_FLOWS)
3178	hfi1_kern_clear_hw_flow(rcd: qpriv->rcd, qp);
3179
3180	for (i = qp->s_acked; i != qp->s_head;) {
3181	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n: i);
3182
3183	if (++i == qp->s_size)
3184	i = `0`;
3185	/ Free only locally allocated TID entries /
3186	if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
3187	continue;
3188	do {
3189	struct hfi1_swqe_priv *priv = wqe->priv;
3190
3191	ret = hfi1_kern_exp_rcv_clear(req: &priv->tid_req);
3192	} while (!ret);
3193	}
3194	for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) {
3195	struct rvt_ack_entry *e = &qp->s_ack_queue[i];
3196
3197	if (++i == rvt_max_atomic(rdi: ib_to_rvt(ibdev: qp->ibqp.device)))
3198	i = `0`;
3199	/ Free only locally allocated TID entries /
3200	if (e->opcode != TID_OP(WRITE_REQ))
3201	continue;
3202	do {
3203	struct hfi1_ack_priv *priv = e->priv;
3204
3205	ret = hfi1_kern_exp_rcv_clear(req: &priv->tid_req);
3206	} while (!ret);
3207	}
3208	}
3209
3210	bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp qp, struct* rvt_swqe *wqe)
3211	{
3212	struct rvt_swqe *prev;
3213	struct hfi1_qp_priv *priv = qp->priv;
3214	u32 s_prev;
3215	struct tid_rdma_request *req;
3216
3217	s_prev = (qp->s_cur == `0` ? qp->s_size : qp->s_cur) - `1`;
3218	prev = rvt_get_swqe_ptr(qp, n: s_prev);
3219
3220	switch (wqe->wr.opcode) {
3221	case IB_WR_SEND:
3222	case IB_WR_SEND_WITH_IMM:
3223	case IB_WR_SEND_WITH_INV:
3224	case IB_WR_ATOMIC_CMP_AND_SWP:
3225	case IB_WR_ATOMIC_FETCH_AND_ADD:
3226	case IB_WR_RDMA_WRITE:
3227	case IB_WR_RDMA_WRITE_WITH_IMM:
3228	switch (prev->wr.opcode) {
3229	case IB_WR_TID_RDMA_WRITE:
3230	req = wqe_to_tid_req(wqe: prev);
3231	if (req->ack_seg != req->total_segs)
3232	goto interlock;
3233	break;
3234	default:
3235	break;
3236	}
3237	break;
3238	case IB_WR_RDMA_READ:
3239	if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
3240	break;
3241	fallthrough;
3242	case IB_WR_TID_RDMA_READ:
3243	switch (prev->wr.opcode) {
3244	case IB_WR_RDMA_READ:
3245	if (qp->s_acked != qp->s_cur)
3246	goto interlock;
3247	break;
3248	case IB_WR_TID_RDMA_WRITE:
3249	req = wqe_to_tid_req(wqe: prev);
3250	if (req->ack_seg != req->total_segs)
3251	goto interlock;
3252	break;
3253	default:
3254	break;
3255	}
3256	break;
3257	default:
3258	break;
3259	}
3260	return false;
3261
3262	interlock:
3263	priv->s_flags \|= HFI1_S_TID_WAIT_INTERLCK;
3264	return true;
3265	}
3266
3267	/ Does @sge meet the alignment requirements for tid rdma? /
3268	static inline bool hfi1_check_sge_align(struct rvt_qp *qp,
3269	struct rvt_sge sge, int* num_sge)
3270	{
3271	int i;
3272
3273	for (i = `0`; i < num_sge; i++, sge++) {
3274	trace_hfi1_sge_check_align(qp, index: i, sge);
3275	if ((u64)sge->vaddr & ~PAGE_MASK \|\|
3276	sge->sge_length & ~PAGE_MASK)
3277	return false;
3278	}
3279	return true;
3280	}
3281
3282	void setup_tid_rdma_wqe(struct rvt_qp qp, struct* rvt_swqe *wqe)
3283	{
3284	struct hfi1_qp_priv qpriv = (struct* hfi1_qp_priv *)qp->priv;
3285	struct hfi1_swqe_priv *priv = wqe->priv;
3286	struct tid_rdma_params *remote;
3287	enum ib_wr_opcode new_opcode;
3288	bool do_tid_rdma = false;
3289	struct hfi1_pportdata *ppd = qpriv->rcd->ppd;
3290
3291	if ((rdma_ah_get_dlid(attr: &qp->remote_ah_attr) & ~((`1` << ppd->lmc) - `1`)) ==
3292	ppd->lid)
3293	return;
3294	if (qpriv->hdr_type != HFI1_PKT_TYPE_9B)
3295	return;
3296
3297	rcu_read_lock();
3298	remote = rcu_dereference(qpriv->tid_rdma.remote);
3299	/*
3300	* If TID RDMA is disabled by the negotiation, don't
3301	* use it.
3302	*/
3303	if (!remote)
3304	goto exit;
3305
3306	if (wqe->wr.opcode == IB_WR_RDMA_READ) {
3307	if (hfi1_check_sge_align(qp, sge: &wqe->sg_list[`0`],
3308	num_sge: wqe->wr.num_sge)) {
3309	new_opcode = IB_WR_TID_RDMA_READ;
3310	do_tid_rdma = true;
3311	}
3312	} else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
3313	/*
3314	* TID RDMA is enabled for this RDMA WRITE request iff:
3315	* 1. The remote address is page-aligned,
3316	* 2. The length is larger than the minimum segment size,
3317	* 3. The length is page-multiple.
3318	*/
3319	if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) &&
3320	!(wqe->length & ~PAGE_MASK)) {
3321	new_opcode = IB_WR_TID_RDMA_WRITE;
3322	do_tid_rdma = true;
3323	}
3324	}
3325
3326	if (do_tid_rdma) {
3327	if (hfi1_kern_exp_rcv_alloc_flows(req: &priv->tid_req, GFP_ATOMIC))
3328	goto exit;
3329	wqe->wr.opcode = new_opcode;
3330	priv->tid_req.seg_len =
3331	min_t(u32, remote->max_len, wqe->length);
3332	priv->tid_req.total_segs =
3333	DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
3334	/ Compute the last PSN of the request /
3335	wqe->lpsn = wqe->psn;
3336	if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
3337	priv->tid_req.n_flows = remote->max_read;
3338	qpriv->tid_r_reqs++;
3339	wqe->lpsn += rvt_div_round_up_mtu(qp, len: wqe->length) - `1`;
3340	} else {
3341	wqe->lpsn += priv->tid_req.total_segs - `1`;
3342	atomic_inc(v: &qpriv->n_requests);
3343	}
3344
3345	priv->tid_req.cur_seg = `0`;
3346	priv->tid_req.comp_seg = `0`;
3347	priv->tid_req.ack_seg = `0`;
3348	priv->tid_req.state = TID_REQUEST_INACTIVE;
3349	/*
3350	* Reset acked_tail.
3351	* TID RDMA READ does not have ACKs so it does not
3352	* update the pointer. We have to reset it so TID RDMA
3353	* WRITE does not get confused.
3354	*/
3355	priv->tid_req.acked_tail = priv->tid_req.setup_head;
3356	trace_hfi1_tid_req_setup_tid_wqe(qp, newreq: `1`, opcode: wqe->wr.opcode,
3357	psn: wqe->psn, lpsn: wqe->lpsn,
3358	req: &priv->tid_req);
3359	}
3360	exit:
3361	rcu_read_unlock();
3362	}
3363
3364	/* TID RDMA WRITE functions */
3365
3366	u32 hfi1_build_tid_rdma_write_req(struct rvt_qp qp, struct* rvt_swqe *wqe,
3367	struct ib_other_headers *ohdr,
3368	u32 bth1, u32 bth2, u32 *len)
3369	{
3370	struct hfi1_qp_priv *qpriv = qp->priv;
3371	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
3372	struct tid_rdma_params *remote;
3373
3374	rcu_read_lock();
3375	remote = rcu_dereference(qpriv->tid_rdma.remote);
3376	/*
3377	* Set the number of flow to be used based on negotiated
3378	* parameters.
3379	*/
3380	req->n_flows = remote->max_write;
3381	req->state = TID_REQUEST_ACTIVE;
3382
3383	KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, `0x1`);
3384	KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
3385	ohdr->u.tid_rdma.w_req.reth.vaddr =
3386	cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len));
3387	ohdr->u.tid_rdma.w_req.reth.rkey =
3388	cpu_to_be32(wqe->rdma_wr.rkey);
3389	ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
3390	ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
3391	*bth1 &= ~RVT_QPN_MASK;
3392	*bth1 \|= remote->qp;
3393	qp->s_state = TID_OP(WRITE_REQ);
3394	qp->s_flags \|= HFI1_S_WAIT_TID_RESP;
3395	*bth2 \|= IB_BTH_REQ_ACK;
3396	*len = `0`;
3397
3398	rcu_read_unlock();
3399	return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
3400	}
3401
3402	static u32 hfi1_compute_tid_rdma_flow_wt(struct rvt_qp *qp)
3403	{
3404	/*
3405	* Heuristic for computing the RNR timeout when waiting on the flow
3406	* queue. Rather than a computationaly expensive exact estimate of when
3407	* a flow will be available, we assume that if a QP is at position N in
3408	* the flow queue it has to wait approximately (N + 1) * (number of
3409	* segments between two sync points). The rationale for this is that
3410	* flows are released and recycled at each sync point.
3411	*/
3412	return (MAX_TID_FLOW_PSN * qp->pmtu) >> TID_RDMA_SEGMENT_SHIFT;
3413	}
3414
3415	static u32 position_in_queue(struct hfi1_qp_priv *qpriv,
3416	struct tid_queue *queue)
3417	{
3418	return qpriv->tid_enqueue - queue->dequeue;
3419	}
3420
3421	/*
3422	* @qp: points to rvt_qp context.
3423	* @to_seg: desired RNR timeout in segments.
3424	* Return: index of the next highest timeout in the ib_hfi1_rnr_table[]
3425	*/
3426	static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg)
3427	{
3428	struct hfi1_qp_priv *qpriv = qp->priv;
3429	u64 timeout;
3430	u32 bytes_per_us;
3431	u8 i;
3432
3433	bytes_per_us = active_egress_rate(ppd: qpriv->rcd->ppd) / `8`;
3434	timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us;
3435	/*
3436	* Find the next highest value in the RNR table to the required
3437	* timeout. This gives the responder some padding.
3438	*/
3439	for (i = `1`; i <= IB_AETH_CREDIT_MASK; i++)
3440	if (rvt_rnr_tbl_to_usec(index: i) >= timeout)
3441	return i;
3442	return `0`;
3443	}
3444
3445	/*
3446	* Central place for resource allocation at TID write responder,
3447	* is called from write_req and write_data interrupt handlers as
3448	* well as the send thread when a queued QP is scheduled for
3449	* resource allocation.
3450	*
3451	* Iterates over (a) segments of a request and then (b) queued requests
3452	* themselves to allocate resources for up to local->max_write
3453	* segments across multiple requests. Stop allocating when we
3454	* hit a sync point, resume allocating after data packets at
3455	* sync point have been received.
3456	*
3457	* Resource allocation and sending of responses is decoupled. The
3458	* request/segment which are being allocated and sent are as follows.
3459	* Resources are allocated for:
3460	* [request: qpriv->r_tid_alloc, segment: req->alloc_seg]
3461	* The send thread sends:
3462	* [request: qp->s_tail_ack_queue, segment:req->cur_seg]
3463	*/
3464	static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
3465	{
3466	struct tid_rdma_request *req;
3467	struct hfi1_qp_priv *qpriv = qp->priv;
3468	struct hfi1_ctxtdata *rcd = qpriv->rcd;
3469	struct tid_rdma_params *local = &qpriv->tid_rdma.local;
3470	struct rvt_ack_entry *e;
3471	u32 npkts, to_seg;
3472	bool last;
3473	int ret = `0`;
3474
3475	lockdep_assert_held(&qp->s_lock);
3476
3477	while (`1`) {
3478	trace_hfi1_rsp_tid_write_alloc_res(qp, psn: `0`);
3479	trace_hfi1_tid_write_rsp_alloc_res(qp);
3480	/*
3481	* Don't allocate more segments if a RNR NAK has already been
3482	* scheduled to avoid messing up qp->r_psn: the RNR NAK will
3483	* be sent only when all allocated segments have been sent.
3484	* However, if more segments are allocated before that, TID RDMA
3485	* WRITE RESP packets will be sent out for these new segments
3486	* before the RNR NAK packet. When the requester receives the
3487	* RNR NAK packet, it will restart with qp->s_last_psn + 1,
3488	* which does not match qp->r_psn and will be dropped.
3489	* Consequently, the requester will exhaust its retries and
3490	* put the qp into error state.
3491	*/
3492	if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND)
3493	break;
3494
3495	/ No requests left to process /
3496	if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
3497	/ If all data has been received, clear the flow /
3498	if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
3499	!qpriv->alloc_w_segs) {
3500	hfi1_kern_clear_hw_flow(rcd, qp);
3501	qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
3502	}
3503	break;
3504	}
3505
3506	e = &qp->s_ack_queue[qpriv->r_tid_alloc];
3507	if (e->opcode != TID_OP(WRITE_REQ))
3508	goto next_req;
3509	req = ack_to_tid_req(e);
3510	trace_hfi1_tid_req_write_alloc_res(qp, newreq: `0`, opcode: e->opcode, psn: e->psn,
3511	lpsn: e->lpsn, req);
3512	/ Finished allocating for all segments of this request /
3513	if (req->alloc_seg >= req->total_segs)
3514	goto next_req;
3515
3516	/ Can allocate only a maximum of local->max_write for a QP /
3517	if (qpriv->alloc_w_segs >= local->max_write)
3518	break;
3519
3520	/ Don't allocate at a sync point with data packets pending /
3521	if (qpriv->sync_pt && qpriv->alloc_w_segs)
3522	break;
3523
3524	/ All data received at the sync point, continue /
3525	if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
3526	hfi1_kern_clear_hw_flow(rcd, qp);
3527	qpriv->sync_pt = false;
3528	qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
3529	}
3530
3531	/ Allocate flow if we don't have one /
3532	if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) {
3533	ret = hfi1_kern_setup_hw_flow(rcd: qpriv->rcd, qp);
3534	if (ret) {
3535	to_seg = hfi1_compute_tid_rdma_flow_wt(qp) *
3536	position_in_queue(qpriv,
3537	queue: &rcd->flow_queue);
3538	break;
3539	}
3540	}
3541
3542	npkts = rvt_div_round_up_mtu(qp, len: req->seg_len);
3543
3544	/*
3545	* We are at a sync point if we run out of KDETH PSN space.
3546	* Last PSN of every generation is reserved for RESYNC.
3547	*/
3548	if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - `1`) {
3549	qpriv->sync_pt = true;
3550	break;
3551	}
3552
3553	/*
3554	* If overtaking req->acked_tail, send an RNR NAK. Because the
3555	* QP is not queued in this case, and the issue can only be
3556	* caused by a delay in scheduling the second leg which we
3557	* cannot estimate, we use a rather arbitrary RNR timeout of
3558	* (MAX_FLOWS / 2) segments
3559	*/
3560	if (!CIRC_SPACE(req->setup_head, req->acked_tail,
3561	MAX_FLOWS)) {
3562	ret = -EAGAIN;
3563	to_seg = MAX_FLOWS >> `1`;
3564	tid_rdma_trigger_ack(qp);
3565	break;
3566	}
3567
3568	/ Try to allocate rcv array / TID entries /
3569	ret = hfi1_kern_exp_rcv_setup(req, ss: &req->ss, last: &last);
3570	if (ret == -EAGAIN)
3571	to_seg = position_in_queue(qpriv, queue: &rcd->rarr_queue);
3572	if (ret)
3573	break;
3574
3575	qpriv->alloc_w_segs++;
3576	req->alloc_seg++;
3577	continue;
3578	next_req:
3579	/ Begin processing the next request /
3580	if (++qpriv->r_tid_alloc >
3581	rvt_size_atomic(rdi: ib_to_rvt(ibdev: qp->ibqp.device)))
3582	qpriv->r_tid_alloc = `0`;
3583	}
3584
3585	/*
3586	* Schedule an RNR NAK to be sent if (a) flow or rcv array allocation
3587	* has failed (b) we are called from the rcv handler interrupt context
3588	* (c) an RNR NAK has not already been scheduled
3589	*/
3590	if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state)
3591	goto send_rnr_nak;
3592
3593	return;
3594
3595	send_rnr_nak:
3596	lockdep_assert_held(&qp->r_lock);
3597
3598	/ Set r_nak_state to prevent unrelated events from generating NAK's /
3599	qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) \| IB_RNR_NAK;
3600
3601	/ Pull back r_psn to the segment being RNR NAK'd /
3602	qp->r_psn = e->psn + req->alloc_seg;
3603	qp->r_ack_psn = qp->r_psn;
3604	/*
3605	* Pull back r_head_ack_queue to the ack entry following the request
3606	* being RNR NAK'd. This allows resources to be allocated to the request
3607	* if the queued QP is scheduled.
3608	*/
3609	qp->r_head_ack_queue = qpriv->r_tid_alloc + `1`;
3610	if (qp->r_head_ack_queue > rvt_size_atomic(rdi: ib_to_rvt(ibdev: qp->ibqp.device)))
3611	qp->r_head_ack_queue = `0`;
3612	qpriv->r_tid_head = qp->r_head_ack_queue;
3613	/*
3614	* These send side fields are used in make_rc_ack(). They are set in
3615	* hfi1_send_rc_ack() but must be set here before dropping qp->s_lock
3616	* for consistency
3617	*/
3618	qp->s_nak_state = qp->r_nak_state;
3619	qp->s_ack_psn = qp->r_ack_psn;
3620	/*
3621	* Clear the ACK PENDING flag to prevent unwanted ACK because we
3622	* have modified qp->s_ack_psn here.
3623	*/
3624	qp->s_flags &= ~(RVT_S_ACK_PENDING);
3625
3626	trace_hfi1_rsp_tid_write_alloc_res(qp, psn: qp->r_psn);
3627	/*
3628	* qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK
3629	* has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be
3630	* used for this because qp->s_lock is dropped before calling
3631	* hfi1_send_rc_ack() leading to inconsistency between the receive
3632	* interrupt handlers and the send thread in make_rc_ack()
3633	*/
3634	qpriv->rnr_nak_state = TID_RNR_NAK_SEND;
3635
3636	/*
3637	* Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive
3638	* interrupt handlers but will be sent from the send engine behind any
3639	* previous responses that may have been scheduled
3640	*/
3641	rc_defered_ack(rcd, qp);
3642	}
3643
3644	void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
3645	{
3646	/ HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)/
3647
3648	/*
3649	* 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST
3650	* (see hfi1_rc_rcv())
3651	* - Don't allow 0-length requests.
3652	* 2. Put TID RDMA WRITE REQ into the response queue (s_ack_queue)
3653	* - Setup struct tid_rdma_req with request info
3654	* - Prepare struct tid_rdma_flow array?
3655	* 3. Set the qp->s_ack_state as state diagram in design doc.
3656	* 4. Set RVT_S_RESP_PENDING in s_flags.
3657	* 5. Kick the send engine (hfi1_schedule_send())
3658	*/
3659	struct hfi1_ctxtdata *rcd = packet->rcd;
3660	struct rvt_qp *qp = packet->qp;
3661	struct hfi1_ibport *ibp = to_iport(ibdev: qp->ibqp.device, port: qp->port_num);
3662	struct ib_other_headers *ohdr = packet->ohdr;
3663	struct rvt_ack_entry *e;
3664	unsigned long flags;
3665	struct ib_reth *reth;
3666	struct hfi1_qp_priv *qpriv = qp->priv;
3667	struct tid_rdma_request *req;
3668	u32 bth0, psn, len, rkey, num_segs;
3669	bool fecn;
3670	u8 next;
3671	u64 vaddr;
3672	int diff;
3673
3674	bth0 = be32_to_cpu(ohdr->bth[`0`]);
3675	if (hfi1_ruc_check_hdr(ibp, packet))
3676	return;
3677
3678	fecn = process_ecn(qp, pkt: packet);
3679	psn = mask_psn(be32_to_cpu(ohdr->bth[`2`]));
3680	trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
3681
3682	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
3683	rvt_comm_est(qp);
3684
3685	if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
3686	goto nack_inv;
3687
3688	reth = &ohdr->u.tid_rdma.w_req.reth;
3689	vaddr = be64_to_cpu(reth->vaddr);
3690	len = be32_to_cpu(reth->length);
3691
3692	num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
3693	diff = delta_psn(a: psn, b: qp->r_psn);
3694	if (unlikely(diff)) {
3695	tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
3696	return;
3697	}
3698
3699	/*
3700	* The resent request which was previously RNR NAK'd is inserted at the
3701	* location of the original request, which is one entry behind
3702	* r_head_ack_queue
3703	*/
3704	if (qpriv->rnr_nak_state)
3705	qp->r_head_ack_queue = qp->r_head_ack_queue ?
3706	qp->r_head_ack_queue - `1` :
3707	rvt_size_atomic(rdi: ib_to_rvt(ibdev: qp->ibqp.device));
3708
3709	/ We've verified the request, insert it into the ack queue. /
3710	next = qp->r_head_ack_queue + `1`;
3711	if (next > rvt_size_atomic(rdi: ib_to_rvt(ibdev: qp->ibqp.device)))
3712	next = `0`;
3713	spin_lock_irqsave(&qp->s_lock, flags);
3714	if (unlikely(next == qp->s_acked_ack_queue)) {
3715	if (!qp->s_ack_queue[next].sent)
3716	goto nack_inv_unlock;
3717	update_ack_queue(qp, n: next);
3718	}
3719	e = &qp->s_ack_queue[qp->r_head_ack_queue];
3720	req = ack_to_tid_req(e);
3721
3722	/ Bring previously RNR NAK'd request back to life /
3723	if (qpriv->rnr_nak_state) {
3724	qp->r_nak_state = `0`;
3725	qp->s_nak_state = `0`;
3726	qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
3727	qp->r_psn = e->lpsn + `1`;
3728	req->state = TID_REQUEST_INIT;
3729	goto update_head;
3730	}
3731
3732	release_rdma_sge_mr(e);
3733
3734	/ The length needs to be in multiples of PAGE_SIZE /
3735	if (!len \|\| len & ~PAGE_MASK)
3736	goto nack_inv_unlock;
3737
3738	rkey = be32_to_cpu(reth->rkey);
3739	qp->r_len = len;
3740
3741	if (e->opcode == TID_OP(WRITE_REQ) &&
3742	(req->setup_head != req->clear_tail \|\|
3743	req->clear_tail != req->acked_tail))
3744	goto nack_inv_unlock;
3745
3746	if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
3747	rkey, IB_ACCESS_REMOTE_WRITE)))
3748	goto nack_acc;
3749
3750	qp->r_psn += num_segs - `1`;
3751
3752	e->opcode = (bth0 >> `24`) & `0xff`;
3753	e->psn = psn;
3754	e->lpsn = qp->r_psn;
3755	e->sent = `0`;
3756
3757	req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write);
3758	req->state = TID_REQUEST_INIT;
3759	req->cur_seg = `0`;
3760	req->comp_seg = `0`;
3761	req->ack_seg = `0`;
3762	req->alloc_seg = `0`;
3763	req->isge = `0`;
3764	req->seg_len = qpriv->tid_rdma.local.max_len;
3765	req->total_len = len;
3766	req->total_segs = num_segs;
3767	req->r_flow_psn = e->psn;
3768	req->ss.sge = e->rdma_sge;
3769	req->ss.num_sge = `1`;
3770
3771	req->flow_idx = req->setup_head;
3772	req->clear_tail = req->setup_head;
3773	req->acked_tail = req->setup_head;
3774
3775	qp->r_state = e->opcode;
3776	qp->r_nak_state = `0`;
3777	/*
3778	* We need to increment the MSN here instead of when we
3779	* finish sending the result since a duplicate request would
3780	* increment it more than once.
3781	*/
3782	qp->r_msn++;
3783	qp->r_psn++;
3784
3785	trace_hfi1_tid_req_rcv_write_req(qp, newreq: `0`, opcode: e->opcode, psn: e->psn, lpsn: e->lpsn,
3786	req);
3787
3788	if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) {
3789	qpriv->r_tid_tail = qp->r_head_ack_queue;
3790	} else if (qpriv->r_tid_tail == qpriv->r_tid_head) {
3791	struct tid_rdma_request *ptr;
3792
3793	e = &qp->s_ack_queue[qpriv->r_tid_tail];
3794	ptr = ack_to_tid_req(e);
3795
3796	if (e->opcode != TID_OP(WRITE_REQ) \|\|
3797	ptr->comp_seg == ptr->total_segs) {
3798	if (qpriv->r_tid_tail == qpriv->r_tid_ack)
3799	qpriv->r_tid_ack = qp->r_head_ack_queue;
3800	qpriv->r_tid_tail = qp->r_head_ack_queue;
3801	}
3802	}
3803	update_head:
3804	qp->r_head_ack_queue = next;
3805	qpriv->r_tid_head = qp->r_head_ack_queue;
3806
3807	hfi1_tid_write_alloc_resources(qp, intr_ctx: true);
3808	trace_hfi1_tid_write_rsp_rcv_req(qp);
3809
3810	/ Schedule the send tasklet. /
3811	qp->s_flags \|= RVT_S_RESP_PENDING;
3812	if (fecn)
3813	qp->s_flags \|= RVT_S_ECN;
3814	hfi1_schedule_send(qp);
3815
3816	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
3817	return;
3818
3819	nack_inv_unlock:
3820	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
3821	nack_inv:
3822	rvt_rc_error(qp, err: IB_WC_LOC_QP_OP_ERR);
3823	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
3824	qp->r_ack_psn = qp->r_psn;
3825	/ Queue NAK for later /
3826	rc_defered_ack(rcd, qp);
3827	return;
3828	nack_acc:
3829	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
3830	rvt_rc_error(qp, err: IB_WC_LOC_PROT_ERR);
3831	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
3832	qp->r_ack_psn = qp->r_psn;
3833	}
3834
3835	u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp qp, struct* rvt_ack_entry *e,
3836	struct ib_other_headers ohdr, u32 bth1,
3837	u32 bth2, u32 *len,
3838	struct rvt_sge_state **ss)
3839	{
3840	struct hfi1_ack_priv *epriv = e->priv;
3841	struct tid_rdma_request *req = &epriv->tid_req;
3842	struct hfi1_qp_priv *qpriv = qp->priv;
3843	struct tid_rdma_flow *flow = NULL;
3844	u32 resp_len = `0`, hdwords = `0`;
3845	void *resp_addr = NULL;
3846	struct tid_rdma_params *remote;
3847
3848	trace_hfi1_tid_req_build_write_resp(qp, newreq: `0`, opcode: e->opcode, psn: e->psn, lpsn: e->lpsn,
3849	req);
3850	trace_hfi1_tid_write_rsp_build_resp(qp);
3851	trace_hfi1_rsp_build_tid_write_resp(qp, psn: bth2);
3852	flow = &req->flows[req->flow_idx];
3853	switch (req->state) {
3854	default:
3855	/*
3856	* Try to allocate resources here in case QP was queued and was
3857	* later scheduled when resources became available
3858	*/
3859	hfi1_tid_write_alloc_resources(qp, intr_ctx: false);
3860
3861	/ We've already sent everything which is ready /
3862	if (req->cur_seg >= req->alloc_seg)
3863	goto done;
3864
3865	/*
3866	* Resources can be assigned but responses cannot be sent in
3867	* rnr_nak state, till the resent request is received
3868	*/
3869	if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT)
3870	goto done;
3871
3872	req->state = TID_REQUEST_ACTIVE;
3873	trace_hfi1_tid_flow_build_write_resp(qp, index: req->flow_idx, flow);
3874	req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
3875	hfi1_add_tid_reap_timer(qp);
3876	break;
3877
3878	case TID_REQUEST_RESEND_ACTIVE:
3879	case TID_REQUEST_RESEND:
3880	trace_hfi1_tid_flow_build_write_resp(qp, index: req->flow_idx, flow);
3881	req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
3882	if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS))
3883	req->state = TID_REQUEST_ACTIVE;
3884
3885	hfi1_mod_tid_reap_timer(qp);
3886	break;
3887	}
3888	flow->flow_state.resp_ib_psn = bth2;
3889	resp_addr = (void *)flow->tid_entry;
3890	resp_len = sizeof(flow->tid_entry) flow->tidcnt;
3891	req->cur_seg++;
3892
3893	memset(&ohdr->u.tid_rdma.w_rsp, `0`, sizeof(ohdr->u.tid_rdma.w_rsp));
3894	epriv->ss.sge.vaddr = resp_addr;
3895	epriv->ss.sge.sge_length = resp_len;
3896	epriv->ss.sge.length = epriv->ss.sge.sge_length;
3897	/*
3898	* We can safely zero these out. Since the first SGE covers the
3899	* entire packet, nothing else should even look at the MR.
3900	*/
3901	epriv->ss.sge.mr = NULL;
3902	epriv->ss.sge.m = `0`;
3903	epriv->ss.sge.n = `0`;
3904
3905	epriv->ss.sg_list = NULL;
3906	epriv->ss.total_len = epriv->ss.sge.sge_length;
3907	epriv->ss.num_sge = `1`;
3908
3909	*ss = &epriv->ss;
3910	*len = epriv->ss.total_len;
3911
3912	/ Construct the TID RDMA WRITE RESP packet header /
3913	rcu_read_lock();
3914	remote = rcu_dereference(qpriv->tid_rdma.remote);
3915
3916	KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, `0x1`);
3917	KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey);
3918	ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp);
3919	ohdr->u.tid_rdma.w_rsp.tid_flow_psn =
3920	cpu_to_be32((flow->flow_state.generation <<
3921	HFI1_KDETH_BTH_SEQ_SHIFT) \|
3922	(flow->flow_state.spsn &
3923	HFI1_KDETH_BTH_SEQ_MASK));
3924	ohdr->u.tid_rdma.w_rsp.tid_flow_qp =
3925	cpu_to_be32(qpriv->tid_rdma.local.qp \|
3926	((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
3927	TID_RDMA_DESTQP_FLOW_SHIFT) \|
3928	qpriv->rcd->ctxt);
3929	ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn);
3930	*bth1 = remote->qp;
3931	rcu_read_unlock();
3932	hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32);
3933	qpriv->pending_tid_w_segs++;
3934	done:
3935	return hdwords;
3936	}
3937
3938	static void hfi1_add_tid_reap_timer(struct rvt_qp *qp)
3939	{
3940	struct hfi1_qp_priv *qpriv = qp->priv;
3941
3942	lockdep_assert_held(&qp->s_lock);
3943	if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) {
3944	qpriv->s_flags \|= HFI1_R_TID_RSC_TIMER;
3945	qpriv->s_tid_timer.expires = jiffies +
3946	qpriv->tid_timer_timeout_jiffies;
3947	add_timer(timer: &qpriv->s_tid_timer);
3948	}
3949	}
3950
3951	static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp)
3952	{
3953	struct hfi1_qp_priv *qpriv = qp->priv;
3954
3955	lockdep_assert_held(&qp->s_lock);
3956	qpriv->s_flags \|= HFI1_R_TID_RSC_TIMER;
3957	mod_timer(timer: &qpriv->s_tid_timer, expires: jiffies +
3958	qpriv->tid_timer_timeout_jiffies);
3959	}
3960
3961	static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp)
3962	{
3963	struct hfi1_qp_priv *qpriv = qp->priv;
3964	int rval = `0`;
3965
3966	lockdep_assert_held(&qp->s_lock);
3967	if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
3968	rval = del_timer(timer: &qpriv->s_tid_timer);
3969	qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
3970	}
3971	return rval;
3972	}
3973
3974	void hfi1_del_tid_reap_timer(struct rvt_qp *qp)
3975	{
3976	struct hfi1_qp_priv *qpriv = qp->priv;
3977
3978	del_timer_sync(timer: &qpriv->s_tid_timer);
3979	qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
3980	}
3981
3982	static void hfi1_tid_timeout(struct timer_list *t)
3983	{
3984	struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer);
3985	struct rvt_qp *qp = qpriv->owner;
3986	struct rvt_dev_info *rdi = ib_to_rvt(ibdev: qp->ibqp.device);
3987	unsigned long flags;
3988	u32 i;
3989
3990	spin_lock_irqsave(&qp->r_lock, flags);
3991	spin_lock(lock: &qp->s_lock);
3992	if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
3993	dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n",
3994	qp->ibqp.qp_num, __func__, __LINE__);
3995	trace_hfi1_msg_tid_timeout(/ msg /
3996	qp, msg: "resource timeout = ",
3997	more: (u64)qpriv->tid_timer_timeout_jiffies);
3998	hfi1_stop_tid_reap_timer(qp);
3999	/*
4000	* Go though the entire ack queue and clear any outstanding
4001	* HW flow and RcvArray resources.
4002	*/
4003	hfi1_kern_clear_hw_flow(rcd: qpriv->rcd, qp);
4004	for (i = `0`; i < rvt_max_atomic(rdi); i++) {
4005	struct tid_rdma_request *req =
4006	ack_to_tid_req(e: &qp->s_ack_queue[i]);
4007
4008	hfi1_kern_exp_rcv_clear_all(req);
4009	}
4010	spin_unlock(lock: &qp->s_lock);
4011	if (qp->ibqp.event_handler) {
4012	struct ib_event ev;
4013
4014	ev.device = qp->ibqp.device;
4015	ev.element.qp = &qp->ibqp;
4016	ev.event = IB_EVENT_QP_FATAL;
4017	qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
4018	}
4019	rvt_rc_error(qp, err: IB_WC_RESP_TIMEOUT_ERR);
4020	goto unlock_r_lock;
4021	}
4022	spin_unlock(lock: &qp->s_lock);
4023	unlock_r_lock:
4024	spin_unlock_irqrestore(lock: &qp->r_lock, flags);
4025	}
4026
4027	void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
4028	{
4029	/ HANDLER FOR TID RDMA WRITE RESPONSE packet (Requester side) /
4030
4031	/*
4032	* 1. Find matching SWQE
4033	* 2. Check that TIDENTRY array has enough space for a complete
4034	* segment. If not, put QP in error state.
4035	* 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow
4036	* 4. Remove HFI1_S_WAIT_TID_RESP from s_flags.
4037	* 5. Set qp->s_state
4038	* 6. Kick the send engine (hfi1_schedule_send())
4039	*/
4040	struct ib_other_headers *ohdr = packet->ohdr;
4041	struct rvt_qp *qp = packet->qp;
4042	struct hfi1_qp_priv *qpriv = qp->priv;
4043	struct hfi1_ctxtdata *rcd = packet->rcd;
4044	struct rvt_swqe *wqe;
4045	struct tid_rdma_request *req;
4046	struct tid_rdma_flow *flow;
4047	enum ib_wc_status status;
4048	u32 opcode, aeth, psn, flow_psn, i, tidlen = `0`, pktlen;
4049	bool fecn;
4050	unsigned long flags;
4051
4052	fecn = process_ecn(qp, pkt: packet);
4053	psn = mask_psn(be32_to_cpu(ohdr->bth[`2`]));
4054	aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
4055	opcode = (be32_to_cpu(ohdr->bth[`0`]) >> `24`) & `0xff`;
4056
4057	spin_lock_irqsave(&qp->s_lock, flags);
4058
4059	/ Ignore invalid responses /
4060	if (cmp_psn(a: psn, b: qp->s_next_psn) >= `0`)
4061	goto ack_done;
4062
4063	/ Ignore duplicate responses. /
4064	if (unlikely(cmp_psn(psn, qp->s_last_psn) <= `0`))
4065	goto ack_done;
4066
4067	if (unlikely(qp->s_acked == qp->s_tail))
4068	goto ack_done;
4069
4070	/*
4071	* If we are waiting for a particular packet sequence number
4072	* due to a request being resent, check for it. Otherwise,
4073	* ensure that we haven't missed anything.
4074	*/
4075	if (qp->r_flags & RVT_R_RDMAR_SEQ) {
4076	if (cmp_psn(a: psn, b: qp->s_last_psn + `1`) != `0`)
4077	goto ack_done;
4078	qp->r_flags &= ~RVT_R_RDMAR_SEQ;
4079	}
4080
4081	wqe = rvt_get_swqe_ptr(qp, n: qpriv->s_tid_cur);
4082	if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE))
4083	goto ack_op_err;
4084
4085	req = wqe_to_tid_req(wqe);
4086	/*
4087	* If we've lost ACKs and our acked_tail pointer is too far
4088	* behind, don't overwrite segments. Just drop the packet and
4089	* let the reliability protocol take care of it.
4090	*/
4091	if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS))
4092	goto ack_done;
4093
4094	/*
4095	* The call to do_rc_ack() should be last in the chain of
4096	* packet checks because it will end up updating the QP state.
4097	* Therefore, anything that would prevent the packet from
4098	* being accepted as a successful response should be prior
4099	* to it.
4100	*/
4101	if (!do_rc_ack(qp, aeth, psn, opcode, val: `0`, rcd))
4102	goto ack_done;
4103
4104	trace_hfi1_ack(qp, psn);
4105
4106	flow = &req->flows[req->setup_head];
4107	flow->pkt = `0`;
4108	flow->tid_idx = `0`;
4109	flow->tid_offset = `0`;
4110	flow->sent = `0`;
4111	flow->resync_npkts = `0`;
4112	flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp);
4113	flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
4114	TID_RDMA_DESTQP_FLOW_MASK;
4115	flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn));
4116	flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
4117	flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
4118	flow->flow_state.resp_ib_psn = psn;
4119	flow->length = min_t(u32, req->seg_len,
4120	(wqe->length - (req->comp_seg * req->seg_len)));
4121
4122	flow->npkts = rvt_div_round_up_mtu(qp, len: flow->length);
4123	flow->flow_state.lpsn = flow->flow_state.spsn +
4124	flow->npkts - `1`;
4125	/ payload length = packet length - (header length + ICRC length) /
4126	pktlen = packet->tlen - (packet->hlen + `4`);
4127	if (pktlen > sizeof(flow->tid_entry)) {
4128	status = IB_WC_LOC_LEN_ERR;
4129	goto ack_err;
4130	}
4131	memcpy(flow->tid_entry, packet->ebuf, pktlen);
4132	flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
4133	trace_hfi1_tid_flow_rcv_write_resp(qp, index: req->setup_head, flow);
4134
4135	req->comp_seg++;
4136	trace_hfi1_tid_write_sender_rcv_resp(qp, newreq: `0`);
4137	/*
4138	* Walk the TID_ENTRY list to make sure we have enough space for a
4139	* complete segment.
4140	*/
4141	for (i = `0`; i < flow->tidcnt; i++) {
4142	trace_hfi1_tid_entry_rcv_write_resp(/ entry /
4143	qp, index: i, entry: flow->tid_entry[i]);
4144	if (!EXP_TID_GET(flow->tid_entry[i], LEN)) {
4145	status = IB_WC_LOC_LEN_ERR;
4146	goto ack_err;
4147	}
4148	tidlen += EXP_TID_GET(flow->tid_entry[i], LEN);
4149	}
4150	if (tidlen * PAGE_SIZE < flow->length) {
4151	status = IB_WC_LOC_LEN_ERR;
4152	goto ack_err;
4153	}
4154
4155	trace_hfi1_tid_req_rcv_write_resp(qp, newreq: `0`, opcode: wqe->wr.opcode, psn: wqe->psn,
4156	lpsn: wqe->lpsn, req);
4157	/*
4158	* If this is the first response for this request, set the initial
4159	* flow index to the current flow.
4160	*/
4161	if (!cmp_psn(a: psn, b: wqe->psn)) {
4162	req->r_last_acked = mask_psn(a: wqe->psn - `1`);
4163	/ Set acked flow index to head index /
4164	req->acked_tail = req->setup_head;
4165	}
4166
4167	/ advance circular buffer head /
4168	req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS);
4169	req->state = TID_REQUEST_ACTIVE;
4170
4171	/*
4172	* If all responses for this TID RDMA WRITE request have been received
4173	* advance the pointer to the next one.
4174	* Since TID RDMA requests could be mixed in with regular IB requests,
4175	* they might not appear sequentially in the queue. Therefore, the
4176	* next request needs to be "found".
4177	*/
4178	if (qpriv->s_tid_cur != qpriv->s_tid_head &&
4179	req->comp_seg == req->total_segs) {
4180	for (i = qpriv->s_tid_cur + `1`; ; i++) {
4181	if (i == qp->s_size)
4182	i = `0`;
4183	wqe = rvt_get_swqe_ptr(qp, n: i);
4184	if (i == qpriv->s_tid_head)
4185	break;
4186	if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
4187	break;
4188	}
4189	qpriv->s_tid_cur = i;
4190	}
4191	qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
4192	hfi1_schedule_tid_send(qp);
4193	goto ack_done;
4194
4195	ack_op_err:
4196	status = IB_WC_LOC_QP_OP_ERR;
4197	ack_err:
4198	rvt_error_qp(qp, err: status);
4199	ack_done:
4200	if (fecn)
4201	qp->s_flags \|= RVT_S_ECN;
4202	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
4203	}
4204
4205	bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
4206	struct ib_other_headers *ohdr,
4207	u32 bth1, u32 bth2, u32 *len)
4208	{
4209	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
4210	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
4211	struct tid_rdma_params *remote;
4212	struct rvt_qp *qp = req->qp;
4213	struct hfi1_qp_priv *qpriv = qp->priv;
4214	u32 tidentry = flow->tid_entry[flow->tid_idx];
4215	u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
4216	struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data;
4217	u32 next_offset, om = KDETH_OM_LARGE;
4218	bool last_pkt;
4219
4220	if (!tidlen) {
4221	hfi1_trdma_send_complete(qp, wqe, status: IB_WC_REM_INV_RD_REQ_ERR);
4222	rvt_error_qp(qp, err: IB_WC_REM_INV_RD_REQ_ERR);
4223	}
4224
4225	*len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
4226	flow->sent += *len;
4227	next_offset = flow->tid_offset + *len;
4228	last_pkt = (flow->tid_idx == (flow->tidcnt - `1`) &&
4229	next_offset >= tidlen) \|\| (flow->sent >= flow->length);
4230	trace_hfi1_tid_entry_build_write_data(qp, index: flow->tid_idx, entry: tidentry);
4231	trace_hfi1_tid_flow_build_write_data(qp, index: req->clear_tail, flow);
4232
4233	rcu_read_lock();
4234	remote = rcu_dereference(qpriv->tid_rdma.remote);
4235	KDETH_RESET(wd->kdeth0, KVER, `0x1`);
4236	KDETH_SET(wd->kdeth0, SH, !last_pkt);
4237	KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg));
4238	KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
4239	KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
4240	KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE);
4241	KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om);
4242	KDETH_RESET(wd->kdeth1, JKEY, remote->jkey);
4243	wd->verbs_qp = cpu_to_be32(qp->remote_qpn);
4244	rcu_read_unlock();
4245
4246	*bth1 = flow->tid_qpn;
4247	*bth2 = mask_psn(a: ((flow->flow_state.spsn + flow->pkt++) &
4248	HFI1_KDETH_BTH_SEQ_MASK) \|
4249	(flow->flow_state.generation <<
4250	HFI1_KDETH_BTH_SEQ_SHIFT));
4251	if (last_pkt) {
4252	/ PSNs are zero-based, so +1 to count number of packets /
4253	if (flow->flow_state.lpsn + `1` +
4254	rvt_div_round_up_mtu(qp, len: req->seg_len) >
4255	MAX_TID_FLOW_PSN)
4256	req->state = TID_REQUEST_SYNC;
4257	*bth2 \|= IB_BTH_REQ_ACK;
4258	}
4259
4260	if (next_offset >= tidlen) {
4261	flow->tid_offset = `0`;
4262	flow->tid_idx++;
4263	} else {
4264	flow->tid_offset = next_offset;
4265	}
4266	return last_pkt;
4267	}
4268
4269	void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
4270	{
4271	struct rvt_qp *qp = packet->qp;
4272	struct hfi1_qp_priv *priv = qp->priv;
4273	struct hfi1_ctxtdata *rcd = priv->rcd;
4274	struct ib_other_headers *ohdr = packet->ohdr;
4275	struct rvt_ack_entry *e;
4276	struct tid_rdma_request *req;
4277	struct tid_rdma_flow *flow;
4278	struct hfi1_ibdev *dev = to_idev(ibdev: qp->ibqp.device);
4279	unsigned long flags;
4280	u32 psn, next;
4281	u8 opcode;
4282	bool fecn;
4283
4284	fecn = process_ecn(qp, pkt: packet);
4285	psn = mask_psn(be32_to_cpu(ohdr->bth[`2`]));
4286	opcode = (be32_to_cpu(ohdr->bth[`0`]) >> `24`) & `0xff`;
4287
4288	/*
4289	* All error handling should be done by now. If we are here, the packet
4290	* is either good or been accepted by the error handler.
4291	*/
4292	spin_lock_irqsave(&qp->s_lock, flags);
4293	e = &qp->s_ack_queue[priv->r_tid_tail];
4294	req = ack_to_tid_req(e);
4295	flow = &req->flows[req->clear_tail];
4296	if (cmp_psn(a: psn, b: full_flow_psn(flow, psn: flow->flow_state.lpsn))) {
4297	update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
4298
4299	if (cmp_psn(a: psn, b: flow->flow_state.r_next_psn))
4300	goto send_nak;
4301
4302	flow->flow_state.r_next_psn = mask_psn(a: psn + `1`);
4303	/*
4304	* Copy the payload to destination buffer if this packet is
4305	* delivered as an eager packet due to RSM rule and FECN.
4306	* The RSM rule selects FECN bit in BTH and SH bit in
4307	* KDETH header and therefore will not match the last
4308	* packet of each segment that has SH bit cleared.
4309	*/
4310	if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
4311	struct rvt_sge_state ss;
4312	u32 len;
4313	u32 tlen = packet->tlen;
4314	u16 hdrsize = packet->hlen;
4315	u8 pad = packet->pad;
4316	u8 extra_bytes = pad + packet->extra_byte +
4317	(SIZE_OF_CRC << `2`);
4318	u32 pmtu = qp->pmtu;
4319
4320	if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
4321	goto send_nak;
4322	len = req->comp_seg * req->seg_len;
4323	len += delta_psn(a: psn,
4324	b: full_flow_psn(flow, psn: flow->flow_state.spsn)) *
4325	pmtu;
4326	if (unlikely(req->total_len - len < pmtu))
4327	goto send_nak;
4328
4329	/*
4330	* The e->rdma_sge field is set when TID RDMA WRITE REQ
4331	* is first received and is never modified thereafter.
4332	*/
4333	ss.sge = e->rdma_sge;
4334	ss.sg_list = NULL;
4335	ss.num_sge = `1`;
4336	ss.total_len = req->total_len;
4337	rvt_skip_sge(ss: &ss, length: len, release: false);
4338	rvt_copy_sge(qp, ss: &ss, data: packet->payload, length: pmtu, release: false,
4339	copy_last: false);
4340	/ Raise the sw sequence check flag for next packet /
4341	priv->r_next_psn_kdeth = mask_psn(a: psn + `1`);
4342	priv->s_flags \|= HFI1_R_TID_SW_PSN;
4343	}
4344	goto exit;
4345	}
4346	flow->flow_state.r_next_psn = mask_psn(a: psn + `1`);
4347	hfi1_kern_exp_rcv_clear(req);
4348	priv->alloc_w_segs--;
4349	rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK;
4350	req->comp_seg++;
4351	priv->s_nak_state = `0`;
4352
4353	/*
4354	* Release the flow if one of the following conditions has been met:
4355	* - The request has reached a sync point AND all outstanding
4356	* segments have been completed, or
4357	* - The entire request is complete and there are no more requests
4358	* (of any kind) in the queue.
4359	*/
4360	trace_hfi1_rsp_rcv_tid_write_data(qp, psn);
4361	trace_hfi1_tid_req_rcv_write_data(qp, newreq: `0`, opcode: e->opcode, psn: e->psn, lpsn: e->lpsn,
4362	req);
4363	trace_hfi1_tid_write_rsp_rcv_data(qp);
4364	validate_r_tid_ack(priv);
4365
4366	if (opcode == TID_OP(WRITE_DATA_LAST)) {
4367	release_rdma_sge_mr(e);
4368	for (next = priv->r_tid_tail + `1`; ; next++) {
4369	if (next > rvt_size_atomic(rdi: &dev->rdi))
4370	next = `0`;
4371	if (next == priv->r_tid_head)
4372	break;
4373	e = &qp->s_ack_queue[next];
4374	if (e->opcode == TID_OP(WRITE_REQ))
4375	break;
4376	}
4377	priv->r_tid_tail = next;
4378	if (++qp->s_acked_ack_queue > rvt_size_atomic(rdi: &dev->rdi))
4379	qp->s_acked_ack_queue = `0`;
4380	}
4381
4382	hfi1_tid_write_alloc_resources(qp, intr_ctx: true);
4383
4384	/*
4385	* If we need to generate more responses, schedule the
4386	* send engine.
4387	*/
4388	if (req->cur_seg < req->total_segs \|\|
4389	qp->s_tail_ack_queue != qp->r_head_ack_queue) {
4390	qp->s_flags \|= RVT_S_RESP_PENDING;
4391	hfi1_schedule_send(qp);
4392	}
4393
4394	priv->pending_tid_w_segs--;
4395	if (priv->s_flags & HFI1_R_TID_RSC_TIMER) {
4396	if (priv->pending_tid_w_segs)
4397	hfi1_mod_tid_reap_timer(qp: req->qp);
4398	else
4399	hfi1_stop_tid_reap_timer(qp: req->qp);
4400	}
4401
4402	done:
4403	tid_rdma_schedule_ack(qp);
4404	exit:
4405	priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
4406	if (fecn)
4407	qp->s_flags \|= RVT_S_ECN;
4408	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
4409	return;
4410
4411	send_nak:
4412	if (!priv->s_nak_state) {
4413	priv->s_nak_state = IB_NAK_PSN_ERROR;
4414	priv->s_nak_psn = flow->flow_state.r_next_psn;
4415	tid_rdma_trigger_ack(qp);
4416	}
4417	goto done;
4418	}
4419
4420	static bool hfi1_tid_rdma_is_resync_psn(u32 psn)
4421	{
4422	return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) ==
4423	HFI1_KDETH_BTH_SEQ_MASK);
4424	}
4425
4426	u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp qp, struct* rvt_ack_entry *e,
4427	struct ib_other_headers *ohdr, u16 iflow,
4428	u32 bth1, u32 bth2)
4429	{
4430	struct hfi1_qp_priv *qpriv = qp->priv;
4431	struct tid_flow_state *fs = &qpriv->flow_state;
4432	struct tid_rdma_request *req = ack_to_tid_req(e);
4433	struct tid_rdma_flow *flow = &req->flows[iflow];
4434	struct tid_rdma_params *remote;
4435
4436	rcu_read_lock();
4437	remote = rcu_dereference(qpriv->tid_rdma.remote);
4438	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
4439	ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
4440	*bth1 = remote->qp;
4441	rcu_read_unlock();
4442
4443	if (qpriv->resync) {
4444	*bth2 = mask_psn(a: (fs->generation <<
4445	HFI1_KDETH_BTH_SEQ_SHIFT) - `1`);
4446	ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
4447	} else if (qpriv->s_nak_state) {
4448	*bth2 = mask_psn(a: qpriv->s_nak_psn);
4449	ohdr->u.tid_rdma.ack.aeth =
4450	cpu_to_be32((qp->r_msn & IB_MSN_MASK) \|
4451	(qpriv->s_nak_state <<
4452	IB_AETH_CREDIT_SHIFT));
4453	} else {
4454	*bth2 = full_flow_psn(flow, psn: flow->flow_state.lpsn);
4455	ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
4456	}
4457	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, `0x1`);
4458	ohdr->u.tid_rdma.ack.tid_flow_qp =
4459	cpu_to_be32(qpriv->tid_rdma.local.qp \|
4460	((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
4461	TID_RDMA_DESTQP_FLOW_SHIFT) \|
4462	qpriv->rcd->ctxt);
4463
4464	ohdr->u.tid_rdma.ack.tid_flow_psn = `0`;
4465	ohdr->u.tid_rdma.ack.verbs_psn =
4466	cpu_to_be32(flow->flow_state.resp_ib_psn);
4467
4468	if (qpriv->resync) {
4469	/*
4470	* If the PSN before the current expect KDETH PSN is the
4471	* RESYNC PSN, then we never received a good TID RDMA WRITE
4472	* DATA packet after a previous RESYNC.
4473	* In this case, the next expected KDETH PSN stays the same.
4474	*/
4475	if (hfi1_tid_rdma_is_resync_psn(psn: qpriv->r_next_psn_kdeth - `1`)) {
4476	ohdr->u.tid_rdma.ack.tid_flow_psn =
4477	cpu_to_be32(qpriv->r_next_psn_kdeth_save);
4478	} else {
4479	/*
4480	* Because the KDETH PSNs jump during a RESYNC, it's
4481	* not possible to infer (or compute) the previous value
4482	* of r_next_psn_kdeth in the case of back-to-back
4483	* RESYNC packets. Therefore, we save it.
4484	*/
4485	qpriv->r_next_psn_kdeth_save =
4486	qpriv->r_next_psn_kdeth - `1`;
4487	ohdr->u.tid_rdma.ack.tid_flow_psn =
4488	cpu_to_be32(qpriv->r_next_psn_kdeth_save);
4489	qpriv->r_next_psn_kdeth = mask_psn(a: *bth2 + `1`);
4490	}
4491	qpriv->resync = false;
4492	}
4493
4494	return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32);
4495	}
4496
4497	void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
4498	{
4499	struct ib_other_headers *ohdr = packet->ohdr;
4500	struct rvt_qp *qp = packet->qp;
4501	struct hfi1_qp_priv *qpriv = qp->priv;
4502	struct rvt_swqe *wqe;
4503	struct tid_rdma_request *req;
4504	struct tid_rdma_flow *flow;
4505	u32 aeth, psn, req_psn, ack_psn, flpsn, resync_psn, ack_kpsn;
4506	unsigned long flags;
4507	u16 fidx;
4508
4509	trace_hfi1_tid_write_sender_rcv_tid_ack(qp, newreq: `0`);
4510	process_ecn(qp, pkt: packet);
4511	psn = mask_psn(be32_to_cpu(ohdr->bth[`2`]));
4512	aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
4513	req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
4514	resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn));
4515
4516	spin_lock_irqsave(&qp->s_lock, flags);
4517	trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn);
4518
4519	/ If we are waiting for an ACK to RESYNC, drop any other packets /
4520	if ((qp->s_flags & HFI1_S_WAIT_HALT) &&
4521	cmp_psn(a: psn, b: qpriv->s_resync_psn))
4522	goto ack_op_err;
4523
4524	ack_psn = req_psn;
4525	if (hfi1_tid_rdma_is_resync_psn(psn))
4526	ack_kpsn = resync_psn;
4527	else
4528	ack_kpsn = psn;
4529	if (aeth >> `29`) {
4530	ack_psn--;
4531	ack_kpsn--;
4532	}
4533
4534	if (unlikely(qp->s_acked == qp->s_tail))
4535	goto ack_op_err;
4536
4537	wqe = rvt_get_swqe_ptr(qp, n: qp->s_acked);
4538
4539	if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
4540	goto ack_op_err;
4541
4542	req = wqe_to_tid_req(wqe);
4543	trace_hfi1_tid_req_rcv_tid_ack(qp, newreq: `0`, opcode: wqe->wr.opcode, psn: wqe->psn,
4544	lpsn: wqe->lpsn, req);
4545	flow = &req->flows[req->acked_tail];
4546	trace_hfi1_tid_flow_rcv_tid_ack(qp, index: req->acked_tail, flow);
4547
4548	/ Drop stale ACK/NAK /
4549	if (cmp_psn(a: psn, b: full_flow_psn(flow, psn: flow->flow_state.spsn)) < `0` \|\|
4550	cmp_psn(a: req_psn, b: flow->flow_state.resp_ib_psn) < `0`)
4551	goto ack_op_err;
4552
4553	while (cmp_psn(a: ack_kpsn,
4554	b: full_flow_psn(flow, psn: flow->flow_state.lpsn)) >= `0` &&
4555	req->ack_seg < req->cur_seg) {
4556	req->ack_seg++;
4557	/ advance acked segment pointer /
4558	req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS);
4559	req->r_last_acked = flow->flow_state.resp_ib_psn;
4560	trace_hfi1_tid_req_rcv_tid_ack(qp, newreq: `0`, opcode: wqe->wr.opcode, psn: wqe->psn,
4561	lpsn: wqe->lpsn, req);
4562	if (req->ack_seg == req->total_segs) {
4563	req->state = TID_REQUEST_COMPLETE;
4564	wqe = do_rc_completion(qp, wqe,
4565	ibp: to_iport(ibdev: qp->ibqp.device,
4566	port: qp->port_num));
4567	trace_hfi1_sender_rcv_tid_ack(qp);
4568	atomic_dec(v: &qpriv->n_tid_requests);
4569	if (qp->s_acked == qp->s_tail)
4570	break;
4571	if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
4572	break;
4573	req = wqe_to_tid_req(wqe);
4574	}
4575	flow = &req->flows[req->acked_tail];
4576	trace_hfi1_tid_flow_rcv_tid_ack(qp, index: req->acked_tail, flow);
4577	}
4578
4579	trace_hfi1_tid_req_rcv_tid_ack(qp, newreq: `0`, opcode: wqe->wr.opcode, psn: wqe->psn,
4580	lpsn: wqe->lpsn, req);
4581	switch (aeth >> `29`) {
4582	case `0`: / ACK /
4583	if (qpriv->s_flags & RVT_S_WAIT_ACK)
4584	qpriv->s_flags &= ~RVT_S_WAIT_ACK;
4585	if (!hfi1_tid_rdma_is_resync_psn(psn)) {
4586	/ Check if there is any pending TID ACK /
4587	if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
4588	req->ack_seg < req->cur_seg)
4589	hfi1_mod_tid_retry_timer(qp);
4590	else
4591	hfi1_stop_tid_retry_timer(qp);
4592	hfi1_schedule_send(qp);
4593	} else {
4594	u32 spsn, fpsn, last_acked, generation;
4595	struct tid_rdma_request *rptr;
4596
4597	/ ACK(RESYNC) /
4598	hfi1_stop_tid_retry_timer(qp);
4599	/ Allow new requests (see hfi1_make_tid_rdma_pkt) /
4600	qp->s_flags &= ~HFI1_S_WAIT_HALT;
4601	/*
4602	* Clear RVT_S_SEND_ONE flag in case that the TID RDMA
4603	* ACK is received after the TID retry timer is fired
4604	* again. In this case, do not send any more TID
4605	* RESYNC request or wait for any more TID ACK packet.
4606	*/
4607	qpriv->s_flags &= ~RVT_S_SEND_ONE;
4608	hfi1_schedule_send(qp);
4609
4610	if ((qp->s_acked == qpriv->s_tid_tail &&
4611	req->ack_seg == req->total_segs) \|\|
4612	qp->s_acked == qp->s_tail) {
4613	qpriv->s_state = TID_OP(WRITE_DATA_LAST);
4614	goto done;
4615	}
4616
4617	if (req->ack_seg == req->comp_seg) {
4618	qpriv->s_state = TID_OP(WRITE_DATA);
4619	goto done;
4620	}
4621
4622	/*
4623	* The PSN to start with is the next PSN after the
4624	* RESYNC PSN.
4625	*/
4626	psn = mask_psn(a: psn + `1`);
4627	generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
4628	spsn = `0`;
4629
4630	/*
4631	* Update to the correct WQE when we get an ACK(RESYNC)
4632	* in the middle of a request.
4633	*/
4634	if (delta_psn(a: ack_psn, b: wqe->lpsn))
4635	wqe = rvt_get_swqe_ptr(qp, n: qp->s_acked);
4636	req = wqe_to_tid_req(wqe);
4637	flow = &req->flows[req->acked_tail];
4638	/*
4639	* RESYNC re-numbers the PSN ranges of all remaining
4640	* segments. Also, PSN's start from 0 in the middle of a
4641	* segment and the first segment size is less than the
4642	* default number of packets. flow->resync_npkts is used
4643	* to track the number of packets from the start of the
4644	* real segment to the point of 0 PSN after the RESYNC
4645	* in order to later correctly rewind the SGE.
4646	*/
4647	fpsn = full_flow_psn(flow, psn: flow->flow_state.spsn);
4648	req->r_ack_psn = psn;
4649	/*
4650	* If resync_psn points to the last flow PSN for a
4651	* segment and the new segment (likely from a new
4652	* request) starts with a new generation number, we
4653	* need to adjust resync_psn accordingly.
4654	*/
4655	if (flow->flow_state.generation !=
4656	(resync_psn >> HFI1_KDETH_BTH_SEQ_SHIFT))
4657	resync_psn = mask_psn(a: fpsn - `1`);
4658	flow->resync_npkts +=
4659	delta_psn(a: mask_psn(a: resync_psn + `1`), b: fpsn);
4660	/*
4661	* Renumber all packet sequence number ranges
4662	* based on the new generation.
4663	*/
4664	last_acked = qp->s_acked;
4665	rptr = req;
4666	while (`1`) {
4667	/ start from last acked segment /
4668	for (fidx = rptr->acked_tail;
4669	CIRC_CNT(rptr->setup_head, fidx,
4670	MAX_FLOWS);
4671	fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
4672	u32 lpsn;
4673	u32 gen;
4674
4675	flow = &rptr->flows[fidx];
4676	gen = flow->flow_state.generation;
4677	if (WARN_ON(gen == generation &&
4678	flow->flow_state.spsn !=
4679	spsn))
4680	continue;
4681	lpsn = flow->flow_state.lpsn;
4682	lpsn = full_flow_psn(flow, psn: lpsn);
4683	flow->npkts =
4684	delta_psn(a: lpsn,
4685	b: mask_psn(a: resync_psn)
4686	);
4687	flow->flow_state.generation =
4688	generation;
4689	flow->flow_state.spsn = spsn;
4690	flow->flow_state.lpsn =
4691	flow->flow_state.spsn +
4692	flow->npkts - `1`;
4693	flow->pkt = `0`;
4694	spsn += flow->npkts;
4695	resync_psn += flow->npkts;
4696	trace_hfi1_tid_flow_rcv_tid_ack(qp,
4697	index: fidx,
4698	flow);
4699	}
4700	if (++last_acked == qpriv->s_tid_cur + `1`)
4701	break;
4702	if (last_acked == qp->s_size)
4703	last_acked = `0`;
4704	wqe = rvt_get_swqe_ptr(qp, n: last_acked);
4705	rptr = wqe_to_tid_req(wqe);
4706	}
4707	req->cur_seg = req->ack_seg;
4708	qpriv->s_tid_tail = qp->s_acked;
4709	qpriv->s_state = TID_OP(WRITE_REQ);
4710	hfi1_schedule_tid_send(qp);
4711	}
4712	done:
4713	qpriv->s_retry = qp->s_retry_cnt;
4714	break;
4715
4716	case `3`: / NAK /
4717	hfi1_stop_tid_retry_timer(qp);
4718	switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
4719	IB_AETH_CREDIT_MASK) {
4720	case `0`: / PSN sequence error /
4721	if (!req->flows)
4722	break;
4723	flow = &req->flows[req->acked_tail];
4724	flpsn = full_flow_psn(flow, psn: flow->flow_state.lpsn);
4725	if (cmp_psn(a: psn, b: flpsn) > `0`)
4726	break;
4727	trace_hfi1_tid_flow_rcv_tid_ack(qp, index: req->acked_tail,
4728	flow);
4729	req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[`2`]));
4730	req->cur_seg = req->ack_seg;
4731	qpriv->s_tid_tail = qp->s_acked;
4732	qpriv->s_state = TID_OP(WRITE_REQ);
4733	qpriv->s_retry = qp->s_retry_cnt;
4734	hfi1_schedule_tid_send(qp);
4735	break;
4736
4737	default:
4738	break;
4739	}
4740	break;
4741
4742	default:
4743	break;
4744	}
4745
4746	ack_op_err:
4747	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
4748	}
4749
4750	void hfi1_add_tid_retry_timer(struct rvt_qp *qp)
4751	{
4752	struct hfi1_qp_priv *priv = qp->priv;
4753	struct ib_qp *ibqp = &qp->ibqp;
4754	struct rvt_dev_info *rdi = ib_to_rvt(ibdev: ibqp->device);
4755
4756	lockdep_assert_held(&qp->s_lock);
4757	if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) {
4758	priv->s_flags \|= HFI1_S_TID_RETRY_TIMER;
4759	priv->s_tid_retry_timer.expires = jiffies +
4760	priv->tid_retry_timeout_jiffies + rdi->busy_jiffies;
4761	add_timer(timer: &priv->s_tid_retry_timer);
4762	}
4763	}
4764
4765	static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp)
4766	{
4767	struct hfi1_qp_priv *priv = qp->priv;
4768	struct ib_qp *ibqp = &qp->ibqp;
4769	struct rvt_dev_info *rdi = ib_to_rvt(ibdev: ibqp->device);
4770
4771	lockdep_assert_held(&qp->s_lock);
4772	priv->s_flags \|= HFI1_S_TID_RETRY_TIMER;
4773	mod_timer(timer: &priv->s_tid_retry_timer, expires: jiffies +
4774	priv->tid_retry_timeout_jiffies + rdi->busy_jiffies);
4775	}
4776
4777	static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp)
4778	{
4779	struct hfi1_qp_priv *priv = qp->priv;
4780	int rval = `0`;
4781
4782	lockdep_assert_held(&qp->s_lock);
4783	if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
4784	rval = del_timer(timer: &priv->s_tid_retry_timer);
4785	priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
4786	}
4787	return rval;
4788	}
4789
4790	void hfi1_del_tid_retry_timer(struct rvt_qp *qp)
4791	{
4792	struct hfi1_qp_priv *priv = qp->priv;
4793
4794	del_timer_sync(timer: &priv->s_tid_retry_timer);
4795	priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
4796	}
4797
4798	static void hfi1_tid_retry_timeout(struct timer_list *t)
4799	{
4800	struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer);
4801	struct rvt_qp *qp = priv->owner;
4802	struct rvt_swqe *wqe;
4803	unsigned long flags;
4804	struct tid_rdma_request *req;
4805
4806	spin_lock_irqsave(&qp->r_lock, flags);
4807	spin_lock(lock: &qp->s_lock);
4808	trace_hfi1_tid_write_sender_retry_timeout(qp, newreq: `0`);
4809	if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
4810	hfi1_stop_tid_retry_timer(qp);
4811	if (!priv->s_retry) {
4812	trace_hfi1_msg_tid_retry_timeout(/ msg /
4813	qp,
4814	msg: "Exhausted retries. Tid retry timeout = ",
4815	more: (u64)priv->tid_retry_timeout_jiffies);
4816
4817	wqe = rvt_get_swqe_ptr(qp, n: qp->s_acked);
4818	hfi1_trdma_send_complete(qp, wqe, status: IB_WC_RETRY_EXC_ERR);
4819	rvt_error_qp(qp, err: IB_WC_WR_FLUSH_ERR);
4820	} else {
4821	wqe = rvt_get_swqe_ptr(qp, n: qp->s_acked);
4822	req = wqe_to_tid_req(wqe);
4823	trace_hfi1_tid_req_tid_retry_timeout(/ req /
4824	qp, newreq: `0`, opcode: wqe->wr.opcode, psn: wqe->psn, lpsn: wqe->lpsn, req);
4825
4826	priv->s_flags &= ~RVT_S_WAIT_ACK;
4827	/ Only send one packet (the RESYNC) /
4828	priv->s_flags \|= RVT_S_SEND_ONE;
4829	/*
4830	* No additional request shall be made by this QP until
4831	* the RESYNC has been complete.
4832	*/
4833	qp->s_flags \|= HFI1_S_WAIT_HALT;
4834	priv->s_state = TID_OP(RESYNC);
4835	priv->s_retry--;
4836	hfi1_schedule_tid_send(qp);
4837	}
4838	}
4839	spin_unlock(lock: &qp->s_lock);
4840	spin_unlock_irqrestore(lock: &qp->r_lock, flags);
4841	}
4842
4843	u32 hfi1_build_tid_rdma_resync(struct rvt_qp qp, struct* rvt_swqe *wqe,
4844	struct ib_other_headers ohdr, u32 bth1,
4845	u32 *bth2, u16 fidx)
4846	{
4847	struct hfi1_qp_priv *qpriv = qp->priv;
4848	struct tid_rdma_params *remote;
4849	struct tid_rdma_request *req = wqe_to_tid_req(wqe);
4850	struct tid_rdma_flow *flow = &req->flows[fidx];
4851	u32 generation;
4852
4853	rcu_read_lock();
4854	remote = rcu_dereference(qpriv->tid_rdma.remote);
4855	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
4856	ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
4857	*bth1 = remote->qp;
4858	rcu_read_unlock();
4859
4860	generation = kern_flow_generation_next(gen: flow->flow_state.generation);
4861	*bth2 = mask_psn(a: (generation << HFI1_KDETH_BTH_SEQ_SHIFT) - `1`);
4862	qpriv->s_resync_psn = *bth2;
4863	*bth2 \|= IB_BTH_REQ_ACK;
4864	KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, `0x1`);
4865
4866	return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32);
4867	}
4868
4869	void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
4870	{
4871	struct ib_other_headers *ohdr = packet->ohdr;
4872	struct rvt_qp *qp = packet->qp;
4873	struct hfi1_qp_priv *qpriv = qp->priv;
4874	struct hfi1_ctxtdata *rcd = qpriv->rcd;
4875	struct hfi1_ibdev *dev = to_idev(ibdev: qp->ibqp.device);
4876	struct rvt_ack_entry *e;
4877	struct tid_rdma_request *req;
4878	struct tid_rdma_flow *flow;
4879	struct tid_flow_state *fs = &qpriv->flow_state;
4880	u32 psn, generation, idx, gen_next;
4881	bool fecn;
4882	unsigned long flags;
4883
4884	fecn = process_ecn(qp, pkt: packet);
4885	psn = mask_psn(be32_to_cpu(ohdr->bth[`2`]));
4886
4887	generation = mask_psn(a: psn + `1`) >> HFI1_KDETH_BTH_SEQ_SHIFT;
4888	spin_lock_irqsave(&qp->s_lock, flags);
4889
4890	gen_next = (fs->generation == KERN_GENERATION_RESERVED) ?
4891	generation : kern_flow_generation_next(gen: fs->generation);
4892	/*
4893	* RESYNC packet contains the "next" generation and can only be
4894	* from the current or previous generations
4895	*/
4896	if (generation != mask_generation(a: gen_next - `1`) &&
4897	generation != gen_next)
4898	goto bail;
4899	/ Already processing a resync /
4900	if (qpriv->resync)
4901	goto bail;
4902
4903	spin_lock(lock: &rcd->exp_lock);
4904	if (fs->index >= RXE_NUM_TID_FLOWS) {
4905	/*
4906	* If we don't have a flow, save the generation so it can be
4907	* applied when a new flow is allocated
4908	*/
4909	fs->generation = generation;
4910	} else {
4911	/ Reprogram the QP flow with new generation /
4912	rcd->flows[fs->index].generation = generation;
4913	fs->generation = kern_setup_hw_flow(rcd, flow_idx: fs->index);
4914	}
4915	fs->psn = `0`;
4916	/*
4917	* Disable SW PSN checking since a RESYNC is equivalent to a
4918	* sync point and the flow has/will be reprogrammed
4919	*/
4920	qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
4921	trace_hfi1_tid_write_rsp_rcv_resync(qp);
4922
4923	/*
4924	* Reset all TID flow information with the new generation.
4925	* This is done for all requests and segments after the
4926	* last received segment
4927	*/
4928	for (idx = qpriv->r_tid_tail; ; idx++) {
4929	u16 flow_idx;
4930
4931	if (idx > rvt_size_atomic(rdi: &dev->rdi))
4932	idx = `0`;
4933	e = &qp->s_ack_queue[idx];
4934	if (e->opcode == TID_OP(WRITE_REQ)) {
4935	req = ack_to_tid_req(e);
4936	trace_hfi1_tid_req_rcv_resync(qp, newreq: `0`, opcode: e->opcode, psn: e->psn,
4937	lpsn: e->lpsn, req);
4938
4939	/ start from last unacked segment /
4940	for (flow_idx = req->clear_tail;
4941	CIRC_CNT(req->setup_head, flow_idx,
4942	MAX_FLOWS);
4943	flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) {
4944	u32 lpsn;
4945	u32 next;
4946
4947	flow = &req->flows[flow_idx];
4948	lpsn = full_flow_psn(flow,
4949	psn: flow->flow_state.lpsn);
4950	next = flow->flow_state.r_next_psn;
4951	flow->npkts = delta_psn(a: lpsn, b: next - `1`);
4952	flow->flow_state.generation = fs->generation;
4953	flow->flow_state.spsn = fs->psn;
4954	flow->flow_state.lpsn =
4955	flow->flow_state.spsn + flow->npkts - `1`;
4956	flow->flow_state.r_next_psn =
4957	full_flow_psn(flow,
4958	psn: flow->flow_state.spsn);
4959	fs->psn += flow->npkts;
4960	trace_hfi1_tid_flow_rcv_resync(qp, index: flow_idx,
4961	flow);
4962	}
4963	}
4964	if (idx == qp->s_tail_ack_queue)
4965	break;
4966	}
4967
4968	spin_unlock(lock: &rcd->exp_lock);
4969	qpriv->resync = true;
4970	/ RESYNC request always gets a TID RDMA ACK. /
4971	qpriv->s_nak_state = `0`;
4972	tid_rdma_trigger_ack(qp);
4973	bail:
4974	if (fecn)
4975	qp->s_flags \|= RVT_S_ECN;
4976	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
4977	}
4978
4979	/*
4980	* Call this function when the last TID RDMA WRITE DATA packet for a request
4981	* is built.
4982	*/
4983	static void update_tid_tail(struct rvt_qp *qp)
4984	__must_hold(&qp->s_lock)
4985	{
4986	struct hfi1_qp_priv *priv = qp->priv;
4987	u32 i;
4988	struct rvt_swqe *wqe;
4989
4990	lockdep_assert_held(&qp->s_lock);
4991	/ Can't move beyond s_tid_cur /
4992	if (priv->s_tid_tail == priv->s_tid_cur)
4993	return;
4994	for (i = priv->s_tid_tail + `1`; ; i++) {
4995	if (i == qp->s_size)
4996	i = `0`;
4997
4998	if (i == priv->s_tid_cur)
4999	break;
5000	wqe = rvt_get_swqe_ptr(qp, n: i);
5001	if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
5002	break;
5003	}
5004	priv->s_tid_tail = i;
5005	priv->s_state = TID_OP(WRITE_RESP);
5006	}
5007
5008	int hfi1_make_tid_rdma_pkt(struct rvt_qp qp, struct* hfi1_pkt_state *ps)
5009	__must_hold(&qp->s_lock)
5010	{
5011	struct hfi1_qp_priv *priv = qp->priv;
5012	struct rvt_swqe *wqe;
5013	u32 bth1 = `0`, bth2 = `0`, hwords = `5`, len, middle = `0`;
5014	struct ib_other_headers *ohdr;
5015	struct rvt_sge_state *ss = &qp->s_sge;
5016	struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue];
5017	struct tid_rdma_request *req = ack_to_tid_req(e);
5018	bool last = false;
5019	u8 opcode = TID_OP(WRITE_DATA);
5020
5021	lockdep_assert_held(&qp->s_lock);
5022	trace_hfi1_tid_write_sender_make_tid_pkt(qp, newreq: `0`);
5023	/*
5024	* Prioritize the sending of the requests and responses over the
5025	* sending of the TID RDMA data packets.
5026	*/
5027	if (((atomic_read(v: &priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) &&
5028	atomic_read(v: &priv->n_requests) &&
5029	!(qp->s_flags & (RVT_S_BUSY \| RVT_S_WAIT_ACK \|
5030	HFI1_S_ANY_WAIT_IO))) \|\|
5031	(e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg &&
5032	!(qp->s_flags & (RVT_S_BUSY \| HFI1_S_ANY_WAIT_IO)))) {
5033	struct iowait_work *iowork;
5034
5035	iowork = iowait_get_ib_work(w: &priv->s_iowait);
5036	ps->s_txreq = get_waiting_verbs_txreq(w: iowork);
5037	if (ps->s_txreq \|\| hfi1_make_rc_req(qp, ps)) {
5038	priv->s_flags \|= HFI1_S_TID_BUSY_SET;
5039	return `1`;
5040	}
5041	}
5042
5043	ps->s_txreq = get_txreq(dev: ps->dev, qp);
5044	if (!ps->s_txreq)
5045	goto bail_no_tx;
5046
5047	ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
5048
5049	if ((priv->s_flags & RVT_S_ACK_PENDING) &&
5050	make_tid_rdma_ack(qp, ohdr, ps))
5051	return `1`;
5052
5053	/*
5054	* Bail out if we can't send data.
5055	* Be reminded that this check must been done after the call to
5056	* make_tid_rdma_ack() because the responding QP could be in
5057	* RTR state where it can send TID RDMA ACK, not TID RDMA WRITE DATA.
5058	*/
5059	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK))
5060	goto bail;
5061
5062	if (priv->s_flags & RVT_S_WAIT_ACK)
5063	goto bail;
5064
5065	/ Check whether there is anything to do. /
5066	if (priv->s_tid_tail == HFI1_QP_WQE_INVALID)
5067	goto bail;
5068	wqe = rvt_get_swqe_ptr(qp, n: priv->s_tid_tail);
5069	req = wqe_to_tid_req(wqe);
5070	trace_hfi1_tid_req_make_tid_pkt(qp, newreq: `0`, opcode: wqe->wr.opcode, psn: wqe->psn,
5071	lpsn: wqe->lpsn, req);
5072	switch (priv->s_state) {
5073	case TID_OP(WRITE_REQ):
5074	case TID_OP(WRITE_RESP):
5075	priv->tid_ss.sge = wqe->sg_list[`0`];
5076	priv->tid_ss.sg_list = wqe->sg_list + `1`;
5077	priv->tid_ss.num_sge = wqe->wr.num_sge;
5078	priv->tid_ss.total_len = wqe->length;
5079
5080	if (priv->s_state == TID_OP(WRITE_REQ))
5081	hfi1_tid_rdma_restart_req(qp, wqe, bth2: &bth2);
5082	priv->s_state = TID_OP(WRITE_DATA);
5083	fallthrough;
5084
5085	case TID_OP(WRITE_DATA):
5086	/*
5087	* 1. Check whether TID RDMA WRITE RESP available.
5088	* 2. If no:
5089	* 2.1 If have more segments and no TID RDMA WRITE RESP,
5090	* set HFI1_S_WAIT_TID_RESP
5091	* 2.2 Return indicating no progress made.
5092	* 3. If yes:
5093	* 3.1 Build TID RDMA WRITE DATA packet.
5094	* 3.2 If last packet in segment:
5095	* 3.2.1 Change KDETH header bits
5096	* 3.2.2 Advance RESP pointers.
5097	* 3.3 Return indicating progress made.
5098	*/
5099	trace_hfi1_sender_make_tid_pkt(qp);
5100	trace_hfi1_tid_write_sender_make_tid_pkt(qp, newreq: `0`);
5101	wqe = rvt_get_swqe_ptr(qp, n: priv->s_tid_tail);
5102	req = wqe_to_tid_req(wqe);
5103	len = wqe->length;
5104
5105	if (!req->comp_seg \|\| req->cur_seg == req->comp_seg)
5106	goto bail;
5107
5108	trace_hfi1_tid_req_make_tid_pkt(qp, newreq: `0`, opcode: wqe->wr.opcode,
5109	psn: wqe->psn, lpsn: wqe->lpsn, req);
5110	last = hfi1_build_tid_rdma_packet(wqe, ohdr, bth1: &bth1, bth2: &bth2,
5111	len: &len);
5112
5113	if (last) {
5114	/ move pointer to next flow /
5115	req->clear_tail = CIRC_NEXT(req->clear_tail,
5116	MAX_FLOWS);
5117	if (++req->cur_seg < req->total_segs) {
5118	if (!CIRC_CNT(req->setup_head, req->clear_tail,
5119	MAX_FLOWS))
5120	qp->s_flags \|= HFI1_S_WAIT_TID_RESP;
5121	} else {
5122	priv->s_state = TID_OP(WRITE_DATA_LAST);
5123	opcode = TID_OP(WRITE_DATA_LAST);
5124
5125	/ Advance the s_tid_tail now /
5126	update_tid_tail(qp);
5127	}
5128	}
5129	hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32);
5130	ss = &priv->tid_ss;
5131	break;
5132
5133	case TID_OP(RESYNC):
5134	trace_hfi1_sender_make_tid_pkt(qp);
5135	/ Use generation from the most recently received response /
5136	wqe = rvt_get_swqe_ptr(qp, n: priv->s_tid_cur);
5137	req = wqe_to_tid_req(wqe);
5138	/ If no responses for this WQE look at the previous one /
5139	if (!req->comp_seg) {
5140	wqe = rvt_get_swqe_ptr(qp,
5141	n: (!priv->s_tid_cur ? qp->s_size :
5142	priv->s_tid_cur) - `1`);
5143	req = wqe_to_tid_req(wqe);
5144	}
5145	hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, bth1: &bth1,
5146	bth2: &bth2,
5147	CIRC_PREV(req->setup_head,
5148	MAX_FLOWS));
5149	ss = NULL;
5150	len = `0`;
5151	opcode = TID_OP(RESYNC);
5152	break;
5153
5154	default:
5155	goto bail;
5156	}
5157	if (priv->s_flags & RVT_S_SEND_ONE) {
5158	priv->s_flags &= ~RVT_S_SEND_ONE;
5159	priv->s_flags \|= RVT_S_WAIT_ACK;
5160	bth2 \|= IB_BTH_REQ_ACK;
5161	}
5162	qp->s_len -= len;
5163	ps->s_txreq->hdr_dwords = hwords;
5164	ps->s_txreq->sde = priv->s_sde;
5165	ps->s_txreq->ss = ss;
5166	ps->s_txreq->s_cur_size = len;
5167	hfi1_make_ruc_header(qp, ohdr, bth0: (opcode << `24`), bth1, bth2,
5168	middle, ps);
5169	return `1`;
5170	bail:
5171	hfi1_put_txreq(tx: ps->s_txreq);
5172	bail_no_tx:
5173	ps->s_txreq = NULL;
5174	priv->s_flags &= ~RVT_S_BUSY;
5175	/*
5176	* If we didn't get a txreq, the QP will be woken up later to try
5177	* again, set the flags to the wake up which work item to wake
5178	* up.
5179	* (A better algorithm should be found to do this and generalize the
5180	* sleep/wakeup flags.)
5181	*/
5182	iowait_set_flag(wait: &priv->s_iowait, IOWAIT_PENDING_TID);
5183	return `0`;
5184	}
5185
5186	static int make_tid_rdma_ack(struct rvt_qp *qp,
5187	struct ib_other_headers *ohdr,
5188	struct hfi1_pkt_state *ps)
5189	{
5190	struct rvt_ack_entry *e;
5191	struct hfi1_qp_priv *qpriv = qp->priv;
5192	struct hfi1_ibdev *dev = to_idev(ibdev: qp->ibqp.device);
5193	u32 hwords, next;
5194	u32 len = `0`;
5195	u32 bth1 = `0`, bth2 = `0`;
5196	int middle = `0`;
5197	u16 flow;
5198	struct tid_rdma_request req, nreq;
5199
5200	trace_hfi1_tid_write_rsp_make_tid_ack(qp);
5201	/ Don't send an ACK if we aren't supposed to. /
5202	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
5203	goto bail;
5204
5205	/ header size in 32-bit words LRH+BTH = (8+12)/4. /
5206	hwords = `5`;
5207
5208	e = &qp->s_ack_queue[qpriv->r_tid_ack];
5209	req = ack_to_tid_req(e);
5210	/*
5211	* In the RESYNC case, we are exactly one segment past the
5212	* previously sent ack or at the previously sent NAK. So to send
5213	* the resync ack, we go back one segment (which might be part of
5214	* the previous request) and let the do-while loop execute again.
5215	* The advantage of executing the do-while loop is that any data
5216	* received after the previous ack is automatically acked in the
5217	* RESYNC ack. It turns out that for the do-while loop we only need
5218	* to pull back qpriv->r_tid_ack, not the segment
5219	* indices/counters. The scheme works even if the previous request
5220	* was not a TID WRITE request.
5221	*/
5222	if (qpriv->resync) {
5223	if (!req->ack_seg \|\| req->ack_seg == req->total_segs)
5224	qpriv->r_tid_ack = !qpriv->r_tid_ack ?
5225	rvt_size_atomic(rdi: &dev->rdi) :
5226	qpriv->r_tid_ack - `1`;
5227	e = &qp->s_ack_queue[qpriv->r_tid_ack];
5228	req = ack_to_tid_req(e);
5229	}
5230
5231	trace_hfi1_rsp_make_tid_ack(qp, psn: e->psn);
5232	trace_hfi1_tid_req_make_tid_ack(qp, newreq: `0`, opcode: e->opcode, psn: e->psn, lpsn: e->lpsn,
5233	req);
5234	/*
5235	* If we've sent all the ACKs that we can, we are done
5236	* until we get more segments...
5237	*/
5238	if (!qpriv->s_nak_state && !qpriv->resync &&
5239	req->ack_seg == req->comp_seg)
5240	goto bail;
5241
5242	do {
5243	/*
5244	* To deal with coalesced ACKs, the acked_tail pointer
5245	* into the flow array is used. The distance between it
5246	* and the clear_tail is the number of flows that are
5247	* being ACK'ed.
5248	*/
5249	req->ack_seg +=
5250	/ Get up-to-date value /
5251	CIRC_CNT(req->clear_tail, req->acked_tail,
5252	MAX_FLOWS);
5253	/ Advance acked index /
5254	req->acked_tail = req->clear_tail;
5255
5256	/*
5257	* req->clear_tail points to the segment currently being
5258	* received. So, when sending an ACK, the previous
5259	* segment is being ACK'ed.
5260	*/
5261	flow = CIRC_PREV(req->acked_tail, MAX_FLOWS);
5262	if (req->ack_seg != req->total_segs)
5263	break;
5264	req->state = TID_REQUEST_COMPLETE;
5265
5266	next = qpriv->r_tid_ack + `1`;
5267	if (next > rvt_size_atomic(rdi: &dev->rdi))
5268	next = `0`;
5269	qpriv->r_tid_ack = next;
5270	if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ))
5271	break;
5272	nreq = ack_to_tid_req(e: &qp->s_ack_queue[next]);
5273	if (!nreq->comp_seg \|\| nreq->ack_seg == nreq->comp_seg)
5274	break;
5275
5276	/ Move to the next ack entry now /
5277	e = &qp->s_ack_queue[qpriv->r_tid_ack];
5278	req = ack_to_tid_req(e);
5279	} while (`1`);
5280
5281	/*
5282	* At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and
5283	* req could be pointing at the previous ack queue entry
5284	*/
5285	if (qpriv->s_nak_state \|\|
5286	(qpriv->resync &&
5287	!hfi1_tid_rdma_is_resync_psn(psn: qpriv->r_next_psn_kdeth - `1`) &&
5288	(cmp_psn(a: qpriv->r_next_psn_kdeth - `1`,
5289	b: full_flow_psn(flow: &req->flows[flow],
5290	psn: req->flows[flow].flow_state.lpsn)) > `0`))) {
5291	/*
5292	* A NAK will implicitly acknowledge all previous TID RDMA
5293	* requests. Therefore, we NAK with the req->acked_tail
5294	* segment for the request at qpriv->r_tid_ack (same at
5295	* this point as the req->clear_tail segment for the
5296	* qpriv->r_tid_tail request)
5297	*/
5298	e = &qp->s_ack_queue[qpriv->r_tid_ack];
5299	req = ack_to_tid_req(e);
5300	flow = req->acked_tail;
5301	} else if (req->ack_seg == req->total_segs &&
5302	qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK)
5303	qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
5304
5305	trace_hfi1_tid_write_rsp_make_tid_ack(qp);
5306	trace_hfi1_tid_req_make_tid_ack(qp, newreq: `0`, opcode: e->opcode, psn: e->psn, lpsn: e->lpsn,
5307	req);
5308	hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, iflow: flow, bth1: &bth1,
5309	bth2: &bth2);
5310	len = `0`;
5311	qpriv->s_flags &= ~RVT_S_ACK_PENDING;
5312	ps->s_txreq->hdr_dwords = hwords;
5313	ps->s_txreq->sde = qpriv->s_sde;
5314	ps->s_txreq->s_cur_size = len;
5315	ps->s_txreq->ss = NULL;
5316	hfi1_make_ruc_header(qp, ohdr, bth0: (TID_OP(ACK) << `24`), bth1, bth2, middle,
5317	ps);
5318	ps->s_txreq->txreq.flags \|= SDMA_TXREQ_F_VIP;
5319	return `1`;
5320	bail:
5321	/*
5322	* Ensure s_rdma_ack_cnt changes are committed prior to resetting
5323	* RVT_S_RESP_PENDING
5324	*/
5325	smp_wmb();
5326	qpriv->s_flags &= ~RVT_S_ACK_PENDING;
5327	return `0`;
5328	}
5329
5330	static int hfi1_send_tid_ok(struct rvt_qp *qp)
5331	{
5332	struct hfi1_qp_priv *priv = qp->priv;
5333
5334	return !(priv->s_flags & RVT_S_BUSY \|\|
5335	qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
5336	(verbs_txreq_queued(w: iowait_get_tid_work(w: &priv->s_iowait)) \|\|
5337	(priv->s_flags & RVT_S_RESP_PENDING) \|\|
5338	!(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
5339	}
5340
5341	void _hfi1_do_tid_send(struct work_struct *work)
5342	{
5343	struct iowait_work w = container_of(work, struct* iowait_work, iowork);
5344	struct rvt_qp *qp = iowait_to_qp(s_iowait: w->iow);
5345
5346	hfi1_do_tid_send(qp);
5347	}
5348
5349	static void hfi1_do_tid_send(struct rvt_qp *qp)
5350	{
5351	struct hfi1_pkt_state ps;
5352	struct hfi1_qp_priv *priv = qp->priv;
5353
5354	ps.dev = to_idev(ibdev: qp->ibqp.device);
5355	ps.ibp = to_iport(ibdev: qp->ibqp.device, port: qp->port_num);
5356	ps.ppd = ppd_from_ibp(ibp: ps.ibp);
5357	ps.wait = iowait_get_tid_work(w: &priv->s_iowait);
5358	ps.in_thread = false;
5359	ps.timeout_int = qp->timeout_jiffies / `8`;
5360
5361	trace_hfi1_rc_do_tid_send(qp, flag: false);
5362	spin_lock_irqsave(&qp->s_lock, ps.flags);
5363
5364	/ Return if we are already busy processing a work request. /
5365	if (!hfi1_send_tid_ok(qp)) {
5366	if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
5367	iowait_set_flag(wait: &priv->s_iowait, IOWAIT_PENDING_TID);
5368	spin_unlock_irqrestore(lock: &qp->s_lock, flags: ps.flags);
5369	return;
5370	}
5371
5372	priv->s_flags \|= RVT_S_BUSY;
5373
5374	ps.timeout = jiffies + ps.timeout_int;
5375	ps.cpu = priv->s_sde ? priv->s_sde->cpu :
5376	cpumask_first(srcp: cpumask_of_node(node: ps.ppd->dd->node));
5377	ps.pkts_sent = false;
5378
5379	/ insure a pre-built packet is handled /
5380	ps.s_txreq = get_waiting_verbs_txreq(w: ps.wait);
5381	do {
5382	/ Check for a constructed packet to be sent. /
5383	if (ps.s_txreq) {
5384	if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
5385	qp->s_flags \|= RVT_S_BUSY;
5386	ps.wait = iowait_get_ib_work(w: &priv->s_iowait);
5387	}
5388	spin_unlock_irqrestore(lock: &qp->s_lock, flags: ps.flags);
5389
5390	/*
5391	* If the packet cannot be sent now, return and
5392	* the send tasklet will be woken up later.
5393	*/
5394	if (hfi1_verbs_send(qp, ps: &ps))
5395	return;
5396
5397	/ allow other tasks to run /
5398	if (hfi1_schedule_send_yield(qp, ps: &ps, tid: true))
5399	return;
5400
5401	spin_lock_irqsave(&qp->s_lock, ps.flags);
5402	if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
5403	qp->s_flags &= ~RVT_S_BUSY;
5404	priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
5405	ps.wait = iowait_get_tid_work(w: &priv->s_iowait);
5406	if (iowait_flag_set(wait: &priv->s_iowait,
5407	IOWAIT_PENDING_IB))
5408	hfi1_schedule_send(qp);
5409	}
5410	}
5411	} while (hfi1_make_tid_rdma_pkt(qp, ps: &ps));
5412	iowait_starve_clear(pkts_sent: ps.pkts_sent, w: &priv->s_iowait);
5413	spin_unlock_irqrestore(lock: &qp->s_lock, flags: ps.flags);
5414	}
5415
5416	static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
5417	{
5418	struct hfi1_qp_priv *priv = qp->priv;
5419	struct hfi1_ibport *ibp =
5420	to_iport(ibdev: qp->ibqp.device, port: qp->port_num);
5421	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
5422	struct hfi1_devdata *dd = ppd->dd;
5423
5424	if ((dd->flags & HFI1_SHUTDOWN))
5425	return true;
5426
5427	return iowait_tid_schedule(wait: &priv->s_iowait, wq: ppd->hfi1_wq,
5428	cpu: priv->s_sde ?
5429	priv->s_sde->cpu :
5430	cpumask_first(srcp: cpumask_of_node(node: dd->node)));
5431	}
5432
5433	/**
5434	* hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
5435	* @qp: the QP
5436	*
5437	* This schedules qp progress on the TID RDMA state machine. Caller
5438	* should hold the s_lock.
5439	* Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
5440	* the two state machines can step on each other with respect to the
5441	* RVT_S_BUSY flag.
5442	* Therefore, a modified test is used.
5443	*
5444	* Return: %true if the second leg is scheduled;
5445	* %false if the second leg is not scheduled.
5446	*/
5447	bool hfi1_schedule_tid_send(struct rvt_qp *qp)
5448	{
5449	lockdep_assert_held(&qp->s_lock);
5450	if (hfi1_send_tid_ok(qp)) {
5451	/*
5452	* The following call returns true if the qp is not on the
5453	* queue and false if the qp is already on the queue before
5454	* this call. Either way, the qp will be on the queue when the
5455	* call returns.
5456	*/
5457	_hfi1_schedule_tid_send(qp);
5458	return true;
5459	}
5460	if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
5461	iowait_set_flag(wait: &((struct hfi1_qp_priv *)qp->priv)->s_iowait,
5462	IOWAIT_PENDING_TID);
5463	return false;
5464	}
5465
5466	bool hfi1_tid_rdma_ack_interlock(struct rvt_qp qp, struct* rvt_ack_entry *e)
5467	{
5468	struct rvt_ack_entry *prev;
5469	struct tid_rdma_request *req;
5470	struct hfi1_ibdev *dev = to_idev(ibdev: qp->ibqp.device);
5471	struct hfi1_qp_priv *priv = qp->priv;
5472	u32 s_prev;
5473
5474	s_prev = qp->s_tail_ack_queue == `0` ? rvt_size_atomic(rdi: &dev->rdi) :
5475	(qp->s_tail_ack_queue - `1`);
5476	prev = &qp->s_ack_queue[s_prev];
5477
5478	if ((e->opcode == TID_OP(READ_REQ) \|\|
5479	e->opcode == OP(RDMA_READ_REQUEST)) &&
5480	prev->opcode == TID_OP(WRITE_REQ)) {
5481	req = ack_to_tid_req(e: prev);
5482	if (req->ack_seg != req->total_segs) {
5483	priv->s_flags \|= HFI1_R_TID_WAIT_INTERLCK;
5484	return true;
5485	}
5486	}
5487	return false;
5488	}
5489
5490	static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx)
5491	{
5492	u64 reg;
5493
5494	/*
5495	* The only sane way to get the amount of
5496	* progress is to read the HW flow state.
5497	*/
5498	reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (`8` * fidx));
5499	return mask_psn(a: reg);
5500	}
5501
5502	static void tid_rdma_rcv_err(struct hfi1_packet *packet,
5503	struct ib_other_headers *ohdr,
5504	struct rvt_qp qp, u32 psn, int* diff, bool fecn)
5505	{
5506	unsigned long flags;
5507
5508	tid_rdma_rcv_error(packet, ohdr, qp, psn, diff);
5509	if (fecn) {
5510	spin_lock_irqsave(&qp->s_lock, flags);
5511	qp->s_flags \|= RVT_S_ECN;
5512	spin_unlock_irqrestore(lock: &qp->s_lock, flags);
5513	}
5514	}
5515
5516	static void update_r_next_psn_fecn(struct hfi1_packet *packet,
5517	struct hfi1_qp_priv *priv,
5518	struct hfi1_ctxtdata *rcd,
5519	struct tid_rdma_flow *flow,
5520	bool fecn)
5521	{
5522	/*
5523	* If a start/middle packet is delivered here due to
5524	* RSM rule and FECN, we need to update the r_next_psn.
5525	*/
5526	if (fecn && packet->etype == RHF_RCV_TYPE_EAGER &&
5527	!(priv->s_flags & HFI1_R_TID_SW_PSN)) {
5528	struct hfi1_devdata *dd = rcd->dd;
5529
5530	flow->flow_state.r_next_psn =
5531	read_r_next_psn(dd, ctxt: rcd->ctxt, fidx: flow->idx);
5532	}
5533	}
5534

source code of linux/drivers/infiniband/hw/hfi1/tid_rdma.c