1 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
2 | /* |
3 | * Copyright(c) 2015 - 2018 Intel Corporation. |
4 | */ |
5 | |
6 | #include <linux/io.h> |
7 | #include <rdma/rdma_vt.h> |
8 | #include <rdma/rdmavt_qp.h> |
9 | |
10 | #include "hfi.h" |
11 | #include "qp.h" |
12 | #include "rc.h" |
13 | #include "verbs_txreq.h" |
14 | #include "trace.h" |
15 | |
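/**
 * find_prev_entry - find the responder ACK queue entry containing a PSN
 * @qp: the QP
 * @psn: the packet sequence number to look for
 * @prev: if non-NULL, receives the index of the entry found
 * @prev_ack: if non-NULL, receives the index of the entry following it
 *            (toward the queue head)
 * @scheduled: if non-NULL, receives whether the response for the entry
 *             has already been scheduled for sending
 *
 * Walk the ACK queue backward from r_head_ack_queue looking for the entry
 * whose PSN range covers @psn. Return the entry, or NULL if none is found.
 */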
16 | struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, |
17 | u8 *prev_ack, bool *scheduled) |
18 | __must_hold(&qp->s_lock) |
19 | { |
20 | struct rvt_ack_entry *e = NULL; |
21 | u8 i, p; |
22 | bool s = true; |
23 | |
24 | for (i = qp->r_head_ack_queue; ; i = p) { |
25 | if (i == qp->s_tail_ack_queue) |
26 | s = false; |
27 | if (i) |
28 | p = i - 1; |
29 | else |
			p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
31 | if (p == qp->r_head_ack_queue) { |
32 | e = NULL; |
33 | break; |
34 | } |
35 | e = &qp->s_ack_queue[p]; |
36 | if (!e->opcode) { |
37 | e = NULL; |
38 | break; |
39 | } |
		if (cmp_psn(psn, e->psn) >= 0) {
			if (p == qp->s_tail_ack_queue &&
			    cmp_psn(psn, e->lpsn) <= 0)
43 | s = false; |
44 | break; |
45 | } |
46 | } |
47 | if (prev) |
48 | *prev = p; |
49 | if (prev_ack) |
50 | *prev_ack = i; |
51 | if (scheduled) |
52 | *scheduled = s; |
53 | return e; |
54 | } |
55 | |
56 | /** |
57 | * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) |
58 | * @dev: the device for this QP |
59 | * @qp: a pointer to the QP |
60 | * @ohdr: a pointer to the IB header being constructed |
61 | * @ps: the xmit packet state |
62 | * |
63 | * Return 1 if constructed; otherwise, return 0. |
64 | * Note that we are in the responder's side of the QP context. |
65 | * Note the QP s_lock must be held. |
66 | */ |
67 | static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, |
68 | struct ib_other_headers *ohdr, |
69 | struct hfi1_pkt_state *ps) |
70 | { |
71 | struct rvt_ack_entry *e; |
72 | u32 hwords, hdrlen; |
73 | u32 len = 0; |
74 | u32 bth0 = 0, bth2 = 0; |
75 | u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); |
76 | int middle = 0; |
77 | u32 pmtu = qp->pmtu; |
78 | struct hfi1_qp_priv *qpriv = qp->priv; |
79 | bool last_pkt; |
80 | u32 delta; |
81 | u8 next = qp->s_tail_ack_queue; |
82 | struct tid_rdma_request *req; |
83 | |
	trace_hfi1_rsp_make_rc_ack(qp, 0);
85 | lockdep_assert_held(&qp->s_lock); |
86 | /* Don't send an ACK if we aren't supposed to. */ |
87 | if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) |
88 | goto bail; |
89 | |
90 | if (qpriv->hdr_type == HFI1_PKT_TYPE_9B) |
91 | /* header size in 32-bit words LRH+BTH = (8+12)/4. */ |
92 | hwords = 5; |
93 | else |
94 | /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ |
95 | hwords = 7; |
96 | |
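	/*
	 * qp->s_ack_state holds the opcode of the last response packet that
	 * was constructed; it determines what, if anything, remains to be
	 * sent for the entry at s_tail_ack_queue.
	 */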
97 | switch (qp->s_ack_state) { |
98 | case OP(RDMA_READ_RESPONSE_LAST): |
99 | case OP(RDMA_READ_RESPONSE_ONLY): |
100 | e = &qp->s_ack_queue[qp->s_tail_ack_queue]; |
101 | release_rdma_sge_mr(e); |
102 | fallthrough; |
103 | case OP(ATOMIC_ACKNOWLEDGE): |
104 | /* |
105 | * We can increment the tail pointer now that the last |
106 | * response has been sent instead of only being |
107 | * constructed. |
108 | */ |
		if (++next > rvt_size_atomic(&dev->rdi))
110 | next = 0; |
111 | /* |
112 | * Only advance the s_acked_ack_queue pointer if there |
113 | * have been no TID RDMA requests. |
114 | */ |
115 | e = &qp->s_ack_queue[qp->s_tail_ack_queue]; |
116 | if (e->opcode != TID_OP(WRITE_REQ) && |
117 | qp->s_acked_ack_queue == qp->s_tail_ack_queue) |
118 | qp->s_acked_ack_queue = next; |
119 | qp->s_tail_ack_queue = next; |
		trace_hfi1_rsp_make_rc_ack(qp, e->psn);
121 | fallthrough; |
122 | case OP(SEND_ONLY): |
123 | case OP(ACKNOWLEDGE): |
124 | /* Check for no next entry in the queue. */ |
125 | if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { |
126 | if (qp->s_flags & RVT_S_ACK_PENDING) |
127 | goto normal; |
128 | goto bail; |
129 | } |
130 | |
131 | e = &qp->s_ack_queue[qp->s_tail_ack_queue]; |
132 | /* Check for tid write fence */ |
133 | if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) || |
134 | hfi1_tid_rdma_ack_interlock(qp, e)) { |
			iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB);
136 | goto bail; |
137 | } |
138 | if (e->opcode == OP(RDMA_READ_REQUEST)) { |
139 | /* |
140 | * If a RDMA read response is being resent and |
141 | * we haven't seen the duplicate request yet, |
142 | * then stop sending the remaining responses the |
143 | * responder has seen until the requester re-sends it. |
144 | */ |
145 | len = e->rdma_sge.sge_length; |
146 | if (len && !e->rdma_sge.mr) { |
147 | if (qp->s_acked_ack_queue == |
148 | qp->s_tail_ack_queue) |
149 | qp->s_acked_ack_queue = |
150 | qp->r_head_ack_queue; |
151 | qp->s_tail_ack_queue = qp->r_head_ack_queue; |
152 | goto bail; |
153 | } |
154 | /* Copy SGE state in case we need to resend */ |
155 | ps->s_txreq->mr = e->rdma_sge.mr; |
156 | if (ps->s_txreq->mr) |
				rvt_get_mr(ps->s_txreq->mr);
158 | qp->s_ack_rdma_sge.sge = e->rdma_sge; |
159 | qp->s_ack_rdma_sge.num_sge = 1; |
160 | ps->s_txreq->ss = &qp->s_ack_rdma_sge; |
161 | if (len > pmtu) { |
162 | len = pmtu; |
163 | qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); |
164 | } else { |
165 | qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); |
166 | e->sent = 1; |
167 | } |
168 | ohdr->u.aeth = rvt_compute_aeth(qp); |
169 | hwords++; |
170 | qp->s_ack_rdma_psn = e->psn; |
			bth2 = mask_psn(qp->s_ack_rdma_psn++);
172 | } else if (e->opcode == TID_OP(WRITE_REQ)) { |
173 | /* |
174 | * If a TID RDMA WRITE RESP is being resent, we have to |
175 | * wait for the actual request. All requests that are to |
176 | * be resent will have their state set to |
177 | * TID_REQUEST_RESEND. When the new request arrives, the |
178 | * state will be changed to TID_REQUEST_RESEND_ACTIVE. |
179 | */ |
180 | req = ack_to_tid_req(e); |
181 | if (req->state == TID_REQUEST_RESEND || |
182 | req->state == TID_REQUEST_INIT_RESEND) |
183 | goto bail; |
184 | qp->s_ack_state = TID_OP(WRITE_RESP); |
			qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
186 | goto write_resp; |
187 | } else if (e->opcode == TID_OP(READ_REQ)) { |
188 | /* |
189 | * If a TID RDMA read response is being resent and |
190 | * we haven't seen the duplicate request yet, |
191 | * then stop sending the remaining responses the |
192 | * responder has seen until the requester re-sends it. |
193 | */ |
194 | len = e->rdma_sge.sge_length; |
195 | if (len && !e->rdma_sge.mr) { |
196 | if (qp->s_acked_ack_queue == |
197 | qp->s_tail_ack_queue) |
198 | qp->s_acked_ack_queue = |
199 | qp->r_head_ack_queue; |
200 | qp->s_tail_ack_queue = qp->r_head_ack_queue; |
201 | goto bail; |
202 | } |
203 | /* Copy SGE state in case we need to resend */ |
204 | ps->s_txreq->mr = e->rdma_sge.mr; |
205 | if (ps->s_txreq->mr) |
				rvt_get_mr(ps->s_txreq->mr);
207 | qp->s_ack_rdma_sge.sge = e->rdma_sge; |
208 | qp->s_ack_rdma_sge.num_sge = 1; |
209 | qp->s_ack_state = TID_OP(READ_RESP); |
210 | goto read_resp; |
211 | } else { |
212 | /* COMPARE_SWAP or FETCH_ADD */ |
213 | ps->s_txreq->ss = NULL; |
214 | len = 0; |
215 | qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); |
216 | ohdr->u.at.aeth = rvt_compute_aeth(qp); |
			ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth);
			hwords += sizeof(ohdr->u.at) / sizeof(u32);
			bth2 = mask_psn(e->psn);
220 | e->sent = 1; |
221 | } |
222 | trace_hfi1_tid_write_rsp_make_rc_ack(qp); |
223 | bth0 = qp->s_ack_state << 24; |
224 | break; |
225 | |
226 | case OP(RDMA_READ_RESPONSE_FIRST): |
227 | qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); |
228 | fallthrough; |
229 | case OP(RDMA_READ_RESPONSE_MIDDLE): |
230 | ps->s_txreq->ss = &qp->s_ack_rdma_sge; |
231 | ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr; |
232 | if (ps->s_txreq->mr) |
			rvt_get_mr(ps->s_txreq->mr);
234 | len = qp->s_ack_rdma_sge.sge.sge_length; |
235 | if (len > pmtu) { |
236 | len = pmtu; |
237 | middle = HFI1_CAP_IS_KSET(SDMA_AHG); |
238 | } else { |
239 | ohdr->u.aeth = rvt_compute_aeth(qp); |
240 | hwords++; |
241 | qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); |
242 | e = &qp->s_ack_queue[qp->s_tail_ack_queue]; |
243 | e->sent = 1; |
244 | } |
245 | bth0 = qp->s_ack_state << 24; |
		bth2 = mask_psn(qp->s_ack_rdma_psn++);
247 | break; |
248 | |
249 | case TID_OP(WRITE_RESP): |
250 | write_resp: |
251 | /* |
252 | * 1. Check if RVT_S_ACK_PENDING is set. If yes, |
253 | * goto normal. |
254 | * 2. Attempt to allocate TID resources. |
255 | * 3. Remove RVT_S_RESP_PENDING flags from s_flags |
256 | * 4. If resources not available: |
257 | * 4.1 Set RVT_S_WAIT_TID_SPACE |
258 | * 4.2 Queue QP on RCD TID queue |
259 | * 4.3 Put QP on iowait list. |
260 | * 4.4 Build IB RNR NAK with appropriate timeout value |
261 | * 4.5 Return indication progress made. |
262 | * 5. If resources are available: |
263 | * 5.1 Program HW flow CSRs |
264 | * 5.2 Build TID RDMA WRITE RESP packet |
265 | * 5.3 If more resources needed, do 2.1 - 2.3. |
266 | * 5.4 Wake up next QP on RCD TID queue. |
267 | * 5.5 Return indication progress made. |
268 | */ |
269 | |
270 | e = &qp->s_ack_queue[qp->s_tail_ack_queue]; |
271 | req = ack_to_tid_req(e); |
272 | |
273 | /* |
274 | * Send scheduled RNR NAK's. RNR NAK's need to be sent at |
275 | * segment boundaries, not at request boundaries. Don't change |
276 | * s_ack_state because we are still in the middle of a request |
277 | */ |
278 | if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND && |
279 | qp->s_tail_ack_queue == qpriv->r_tid_alloc && |
280 | req->cur_seg == req->alloc_seg) { |
281 | qpriv->rnr_nak_state = TID_RNR_NAK_SENT; |
282 | goto normal_no_state; |
283 | } |
284 | |
		bth2 = mask_psn(qp->s_ack_rdma_psn);
		hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
							bth2, &len,
							&ps->s_txreq->ss);
289 | if (!hdrlen) |
290 | return 0; |
291 | |
292 | hwords += hdrlen; |
293 | bth0 = qp->s_ack_state << 24; |
294 | qp->s_ack_rdma_psn++; |
		trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn,
						     e->lpsn, req);
297 | if (req->cur_seg != req->total_segs) |
298 | break; |
299 | |
300 | e->sent = 1; |
301 | /* Do not free e->rdma_sge until all data are received */ |
302 | qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); |
303 | break; |
304 | |
305 | case TID_OP(READ_RESP): |
306 | read_resp: |
307 | e = &qp->s_ack_queue[qp->s_tail_ack_queue]; |
308 | ps->s_txreq->ss = &qp->s_ack_rdma_sge; |
		delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0,
						      &bth1, &bth2, &len,
						      &last_pkt);
312 | if (delta == 0) |
313 | goto error_qp; |
314 | hwords += delta; |
315 | if (last_pkt) { |
316 | e->sent = 1; |
317 | /* |
318 | * Increment qp->s_tail_ack_queue through s_ack_state |
319 | * transition. |
320 | */ |
321 | qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); |
322 | } |
323 | break; |
324 | case TID_OP(READ_REQ): |
325 | goto bail; |
326 | |
327 | default: |
328 | normal: |
329 | /* |
330 | * Send a regular ACK. |
331 | * Set the s_ack_state so we wait until after sending |
332 | * the ACK before setting s_ack_state to ACKNOWLEDGE |
333 | * (see above). |
334 | */ |
335 | qp->s_ack_state = OP(SEND_ONLY); |
336 | normal_no_state: |
337 | if (qp->s_nak_state) |
338 | ohdr->u.aeth = |
339 | cpu_to_be32((qp->r_msn & IB_MSN_MASK) | |
340 | (qp->s_nak_state << |
341 | IB_AETH_CREDIT_SHIFT)); |
342 | else |
343 | ohdr->u.aeth = rvt_compute_aeth(qp); |
344 | hwords++; |
345 | len = 0; |
346 | bth0 = OP(ACKNOWLEDGE) << 24; |
		bth2 = mask_psn(qp->s_ack_psn);
348 | qp->s_flags &= ~RVT_S_ACK_PENDING; |
349 | ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; |
350 | ps->s_txreq->ss = NULL; |
351 | } |
352 | qp->s_rdma_ack_cnt++; |
353 | ps->s_txreq->sde = qpriv->s_sde; |
354 | ps->s_txreq->s_cur_size = len; |
355 | ps->s_txreq->hdr_dwords = hwords; |
356 | hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); |
357 | return 1; |
358 | error_qp: |
	spin_unlock_irqrestore(&qp->s_lock, ps->flags);
	spin_lock_irqsave(&qp->r_lock, ps->flags);
	spin_lock(&qp->s_lock);
	rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
	spin_unlock(&qp->s_lock);
	spin_unlock_irqrestore(&qp->r_lock, ps->flags);
365 | spin_lock_irqsave(&qp->s_lock, ps->flags); |
366 | bail: |
367 | qp->s_ack_state = OP(ACKNOWLEDGE); |
368 | /* |
369 | * Ensure s_rdma_ack_cnt changes are committed prior to resetting |
370 | * RVT_S_RESP_PENDING |
371 | */ |
372 | smp_wmb(); |
373 | qp->s_flags &= ~(RVT_S_RESP_PENDING |
374 | | RVT_S_ACK_PENDING |
375 | | HFI1_S_AHG_VALID); |
376 | return 0; |
377 | } |
378 | |
379 | /** |
380 | * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) |
381 | * @qp: a pointer to the QP |
382 | * @ps: the current packet state |
383 | * |
384 | * Assumes s_lock is held. |
385 | * |
386 | * Return 1 if constructed; otherwise, return 0. |
387 | */ |
388 | int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) |
389 | { |
390 | struct hfi1_qp_priv *priv = qp->priv; |
	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
392 | struct ib_other_headers *ohdr; |
393 | struct rvt_sge_state *ss = NULL; |
394 | struct rvt_swqe *wqe; |
395 | struct hfi1_swqe_priv *wpriv; |
396 | struct tid_rdma_request *req = NULL; |
397 | /* header size in 32-bit words LRH+BTH = (8+12)/4. */ |
398 | u32 hwords = 5; |
399 | u32 len = 0; |
400 | u32 bth0 = 0, bth2 = 0; |
401 | u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); |
402 | u32 pmtu = qp->pmtu; |
403 | char newreq; |
404 | int middle = 0; |
405 | int delta; |
406 | struct tid_rdma_flow *flow = NULL; |
407 | struct tid_rdma_params *remote; |
408 | |
409 | trace_hfi1_sender_make_rc_req(qp); |
410 | lockdep_assert_held(&qp->s_lock); |
	ps->s_txreq = get_txreq(ps->dev, qp);
412 | if (!ps->s_txreq) |
413 | goto bail_no_tx; |
414 | |
415 | if (priv->hdr_type == HFI1_PKT_TYPE_9B) { |
416 | /* header size in 32-bit words LRH+BTH = (8+12)/4. */ |
417 | hwords = 5; |
		if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)
419 | ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; |
420 | else |
421 | ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; |
422 | } else { |
423 | /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ |
424 | hwords = 7; |
		if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
		    (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))))
427 | ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; |
428 | else |
429 | ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; |
430 | } |
431 | |
432 | /* Sending responses has higher priority over sending requests. */ |
433 | if ((qp->s_flags & RVT_S_RESP_PENDING) && |
434 | make_rc_ack(dev, qp, ohdr, ps)) |
435 | return 1; |
436 | |
437 | if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { |
438 | if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) |
439 | goto bail; |
440 | /* We are in the error state, flush the work request. */ |
441 | if (qp->s_last == READ_ONCE(qp->s_head)) |
442 | goto bail; |
443 | /* If DMAs are in progress, we can't flush immediately. */ |
		if (iowait_sdma_pending(&priv->s_iowait)) {
445 | qp->s_flags |= RVT_S_WAIT_DMA; |
446 | goto bail; |
447 | } |
448 | clear_ahg(qp); |
		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
		hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
					 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
452 | /* will get called again */ |
453 | goto done_free_tx; |
454 | } |
455 | |
456 | if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT)) |
457 | goto bail; |
458 | |
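	/*
	 * If the next PSN to build overlaps packets that the send engine is
	 * still transmitting, wait (RVT_S_WAIT_PSN) for those packets to
	 * clear before constructing that PSN again.
	 */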
	if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
		if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
461 | qp->s_flags |= RVT_S_WAIT_PSN; |
462 | goto bail; |
463 | } |
464 | qp->s_sending_psn = qp->s_psn; |
465 | qp->s_sending_hpsn = qp->s_psn - 1; |
466 | } |
467 | |
468 | /* Send a request. */ |
	wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
470 | check_s_state: |
471 | switch (qp->s_state) { |
472 | default: |
473 | if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) |
474 | goto bail; |
475 | /* |
476 | * Resend an old request or start a new one. |
477 | * |
478 | * We keep track of the current SWQE so that |
479 | * we don't reset the "furthest progress" state |
480 | * if we need to back up. |
481 | */ |
482 | newreq = 0; |
483 | if (qp->s_cur == qp->s_tail) { |
484 | /* Check if send work queue is empty. */ |
485 | if (qp->s_tail == READ_ONCE(qp->s_head)) { |
486 | clear_ahg(qp); |
487 | goto bail; |
488 | } |
489 | /* |
490 | * If a fence is requested, wait for previous |
491 | * RDMA read and atomic operations to finish. |
492 | * However, there is no need to guard against |
493 | * TID RDMA READ after TID RDMA READ. |
494 | */ |
495 | if ((wqe->wr.send_flags & IB_SEND_FENCE) && |
496 | qp->s_num_rd_atomic && |
497 | (wqe->wr.opcode != IB_WR_TID_RDMA_READ || |
498 | priv->pending_tid_r_segs < qp->s_num_rd_atomic)) { |
499 | qp->s_flags |= RVT_S_WAIT_FENCE; |
500 | goto bail; |
501 | } |
502 | /* |
503 | * Local operations are processed immediately |
504 | * after all prior requests have completed |
505 | */ |
506 | if (wqe->wr.opcode == IB_WR_REG_MR || |
507 | wqe->wr.opcode == IB_WR_LOCAL_INV) { |
508 | int local_ops = 0; |
509 | int err = 0; |
510 | |
511 | if (qp->s_last != qp->s_cur) |
512 | goto bail; |
513 | if (++qp->s_cur == qp->s_size) |
514 | qp->s_cur = 0; |
515 | if (++qp->s_tail == qp->s_size) |
516 | qp->s_tail = 0; |
517 | if (!(wqe->wr.send_flags & |
518 | RVT_SEND_COMPLETION_ONLY)) { |
					err = rvt_invalidate_rkey(
						qp,
						wqe->wr.ex.invalidate_rkey);
					local_ops = 1;
				}
				rvt_send_complete(qp, wqe,
						  err ? IB_WC_LOC_PROT_ERR
						      : IB_WC_SUCCESS);
				if (local_ops)
					atomic_dec(&qp->local_ops_pending);
529 | goto done_free_tx; |
530 | } |
531 | |
532 | newreq = 1; |
533 | qp->s_psn = wqe->psn; |
534 | } |
535 | /* |
536 | * Note that we have to be careful not to modify the |
537 | * original work request since we may need to resend |
538 | * it. |
539 | */ |
540 | len = wqe->length; |
541 | ss = &qp->s_sge; |
		bth2 = mask_psn(qp->s_psn);
543 | |
544 | /* |
545 | * Interlock between various IB requests and TID RDMA |
546 | * if necessary. |
547 | */ |
548 | if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) || |
549 | hfi1_tid_rdma_wqe_interlock(qp, wqe)) |
550 | goto bail; |
551 | |
552 | switch (wqe->wr.opcode) { |
553 | case IB_WR_SEND: |
554 | case IB_WR_SEND_WITH_IMM: |
555 | case IB_WR_SEND_WITH_INV: |
556 | /* If no credit, return. */ |
557 | if (!rvt_rc_credit_avail(qp, wqe)) |
558 | goto bail; |
559 | if (len > pmtu) { |
560 | qp->s_state = OP(SEND_FIRST); |
561 | len = pmtu; |
562 | break; |
563 | } |
564 | if (wqe->wr.opcode == IB_WR_SEND) { |
565 | qp->s_state = OP(SEND_ONLY); |
566 | } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { |
567 | qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); |
568 | /* Immediate data comes after the BTH */ |
569 | ohdr->u.imm_data = wqe->wr.ex.imm_data; |
570 | hwords += 1; |
571 | } else { |
572 | qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE); |
573 | /* Invalidate rkey comes after the BTH */ |
574 | ohdr->u.ieth = cpu_to_be32( |
575 | wqe->wr.ex.invalidate_rkey); |
576 | hwords += 1; |
577 | } |
578 | if (wqe->wr.send_flags & IB_SEND_SOLICITED) |
579 | bth0 |= IB_BTH_SOLICITED; |
580 | bth2 |= IB_BTH_REQ_ACK; |
581 | if (++qp->s_cur == qp->s_size) |
582 | qp->s_cur = 0; |
583 | break; |
584 | |
585 | case IB_WR_RDMA_WRITE: |
586 | if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) |
587 | qp->s_lsn++; |
588 | goto no_flow_control; |
589 | case IB_WR_RDMA_WRITE_WITH_IMM: |
590 | /* If no credit, return. */ |
591 | if (!rvt_rc_credit_avail(qp, wqe)) |
592 | goto bail; |
593 | no_flow_control: |
			put_ib_reth_vaddr(
				wqe->rdma_wr.remote_addr,
				&ohdr->u.rc.reth);
597 | ohdr->u.rc.reth.rkey = |
598 | cpu_to_be32(wqe->rdma_wr.rkey); |
599 | ohdr->u.rc.reth.length = cpu_to_be32(len); |
600 | hwords += sizeof(struct ib_reth) / sizeof(u32); |
601 | if (len > pmtu) { |
602 | qp->s_state = OP(RDMA_WRITE_FIRST); |
603 | len = pmtu; |
604 | break; |
605 | } |
606 | if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { |
607 | qp->s_state = OP(RDMA_WRITE_ONLY); |
608 | } else { |
609 | qp->s_state = |
610 | OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); |
611 | /* Immediate data comes after RETH */ |
612 | ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; |
613 | hwords += 1; |
614 | if (wqe->wr.send_flags & IB_SEND_SOLICITED) |
615 | bth0 |= IB_BTH_SOLICITED; |
616 | } |
617 | bth2 |= IB_BTH_REQ_ACK; |
618 | if (++qp->s_cur == qp->s_size) |
619 | qp->s_cur = 0; |
620 | break; |
621 | |
622 | case IB_WR_TID_RDMA_WRITE: |
623 | if (newreq) { |
624 | /* |
625 | * Limit the number of TID RDMA WRITE requests. |
626 | */ |
				if (atomic_read(&priv->n_tid_requests) >=
628 | HFI1_TID_RDMA_WRITE_CNT) |
629 | goto bail; |
630 | |
631 | if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) |
632 | qp->s_lsn++; |
633 | } |
634 | |
			hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
								&bth1, &bth2,
								&len);
638 | ss = NULL; |
639 | if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) { |
640 | priv->s_tid_cur = qp->s_cur; |
641 | if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) { |
642 | priv->s_tid_tail = qp->s_cur; |
643 | priv->s_state = TID_OP(WRITE_RESP); |
644 | } |
645 | } else if (priv->s_tid_cur == priv->s_tid_head) { |
646 | struct rvt_swqe *__w; |
647 | struct tid_rdma_request *__r; |
648 | |
				__w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
				__r = wqe_to_tid_req(__w);
651 | |
652 | /* |
653 | * The s_tid_cur pointer is advanced to s_cur if |
654 | * any of the following conditions about the WQE |
				 * to which s_tid_cur currently points are
656 | * satisfied: |
657 | * 1. The request is not a TID RDMA WRITE |
658 | * request, |
659 | * 2. The request is in the INACTIVE or |
660 | * COMPLETE states (TID RDMA READ requests |
661 | * stay at INACTIVE and TID RDMA WRITE |
662 | * transition to COMPLETE when done), |
663 | * 3. The request is in the ACTIVE or SYNC |
664 | * state and the number of completed |
665 | * segments is equal to the total segment |
666 | * count. |
667 | * (If ACTIVE, the request is waiting for |
668 | * ACKs. If SYNC, the request has not |
669 | * received any responses because it's |
670 | * waiting on a sync point.) |
671 | */ |
672 | if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE || |
673 | __r->state == TID_REQUEST_INACTIVE || |
674 | __r->state == TID_REQUEST_COMPLETE || |
675 | ((__r->state == TID_REQUEST_ACTIVE || |
676 | __r->state == TID_REQUEST_SYNC) && |
677 | __r->comp_seg == __r->total_segs)) { |
678 | if (priv->s_tid_tail == |
679 | priv->s_tid_cur && |
680 | priv->s_state == |
681 | TID_OP(WRITE_DATA_LAST)) { |
682 | priv->s_tid_tail = qp->s_cur; |
683 | priv->s_state = |
684 | TID_OP(WRITE_RESP); |
685 | } |
686 | priv->s_tid_cur = qp->s_cur; |
687 | } |
688 | /* |
689 | * A corner case: when the last TID RDMA WRITE |
690 | * request was completed, s_tid_head, |
691 | * s_tid_cur, and s_tid_tail all point to the |
692 | * same location. Other requests are posted and |
693 | * s_cur wraps around to the same location, |
694 | * where a new TID RDMA WRITE is posted. In |
695 | * this case, none of the indices need to be |
696 | * updated. However, the priv->s_state should. |
697 | */ |
698 | if (priv->s_tid_tail == qp->s_cur && |
699 | priv->s_state == TID_OP(WRITE_DATA_LAST)) |
700 | priv->s_state = TID_OP(WRITE_RESP); |
701 | } |
702 | req = wqe_to_tid_req(wqe); |
703 | if (newreq) { |
704 | priv->s_tid_head = qp->s_cur; |
705 | priv->pending_tid_w_resp += req->total_segs; |
				atomic_inc(&priv->n_tid_requests);
				atomic_dec(&priv->n_requests);
708 | } else { |
709 | req->state = TID_REQUEST_RESEND; |
				req->comp_seg = delta_psn(bth2, wqe->psn);
711 | /* |
712 | * Pull back any segments since we are going |
713 | * to re-receive them. |
714 | */ |
715 | req->setup_head = req->clear_tail; |
				priv->pending_tid_w_resp +=
					delta_psn(wqe->lpsn, bth2) + 1;
718 | } |
719 | |
720 | trace_hfi1_tid_write_sender_make_req(qp, newreq); |
			trace_hfi1_tid_req_make_req_write(qp, newreq,
							  wqe->wr.opcode,
							  wqe->psn, wqe->lpsn,
							  req);
725 | if (++qp->s_cur == qp->s_size) |
726 | qp->s_cur = 0; |
727 | break; |
728 | |
729 | case IB_WR_RDMA_READ: |
730 | /* |
731 | * Don't allow more operations to be started |
732 | * than the QP limits allow. |
733 | */ |
734 | if (qp->s_num_rd_atomic >= |
735 | qp->s_max_rd_atomic) { |
736 | qp->s_flags |= RVT_S_WAIT_RDMAR; |
737 | goto bail; |
738 | } |
739 | qp->s_num_rd_atomic++; |
740 | if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) |
741 | qp->s_lsn++; |
			put_ib_reth_vaddr(
				wqe->rdma_wr.remote_addr,
				&ohdr->u.rc.reth);
745 | ohdr->u.rc.reth.rkey = |
746 | cpu_to_be32(wqe->rdma_wr.rkey); |
747 | ohdr->u.rc.reth.length = cpu_to_be32(len); |
748 | qp->s_state = OP(RDMA_READ_REQUEST); |
749 | hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); |
750 | ss = NULL; |
751 | len = 0; |
752 | bth2 |= IB_BTH_REQ_ACK; |
753 | if (++qp->s_cur == qp->s_size) |
754 | qp->s_cur = 0; |
755 | break; |
756 | |
757 | case IB_WR_TID_RDMA_READ: |
758 | trace_hfi1_tid_read_sender_make_req(qp, newreq); |
759 | wpriv = wqe->priv; |
760 | req = wqe_to_tid_req(wqe); |
			trace_hfi1_tid_req_make_req_read(qp, newreq,
							 wqe->wr.opcode,
							 wqe->psn, wqe->lpsn,
							 req);
			delta = cmp_psn(qp->s_psn, wqe->psn);
766 | |
767 | /* |
768 | * Don't allow more operations to be started |
769 | * than the QP limits allow. We could get here under |
770 | * three conditions; (1) It's a new request; (2) We are |
771 | * sending the second or later segment of a request, |
772 | * but the qp->s_state is set to OP(RDMA_READ_REQUEST) |
773 | * when the last segment of a previous request is |
774 | * received just before this; (3) We are re-sending a |
775 | * request. |
776 | */ |
777 | if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { |
778 | qp->s_flags |= RVT_S_WAIT_RDMAR; |
779 | goto bail; |
780 | } |
781 | if (newreq) { |
782 | struct tid_rdma_flow *flow = |
783 | &req->flows[req->setup_head]; |
784 | |
785 | /* |
786 | * Set up s_sge as it is needed for TID |
787 | * allocation. However, if the pages have been |
788 | * walked and mapped, skip it. An earlier try |
789 | * has failed to allocate the TID entries. |
790 | */ |
791 | if (!flow->npagesets) { |
792 | qp->s_sge.sge = wqe->sg_list[0]; |
793 | qp->s_sge.sg_list = wqe->sg_list + 1; |
794 | qp->s_sge.num_sge = wqe->wr.num_sge; |
795 | qp->s_sge.total_len = wqe->length; |
796 | qp->s_len = wqe->length; |
797 | req->isge = 0; |
798 | req->clear_tail = req->setup_head; |
799 | req->flow_idx = req->setup_head; |
800 | req->state = TID_REQUEST_ACTIVE; |
801 | } |
802 | } else if (delta == 0) { |
803 | /* Re-send a request */ |
804 | req->cur_seg = 0; |
805 | req->comp_seg = 0; |
806 | req->ack_pending = 0; |
807 | req->flow_idx = req->clear_tail; |
808 | req->state = TID_REQUEST_RESEND; |
809 | } |
810 | req->s_next_psn = qp->s_psn; |
811 | /* Read one segment at a time */ |
812 | len = min_t(u32, req->seg_len, |
813 | wqe->length - req->seg_len * req->cur_seg); |
			delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
							     &bth1, &bth2,
							     &len);
817 | if (delta <= 0) { |
818 | /* Wait for TID space */ |
819 | goto bail; |
820 | } |
821 | if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) |
822 | qp->s_lsn++; |
823 | hwords += delta; |
824 | ss = &wpriv->ss; |
825 | /* Check if this is the last segment */ |
826 | if (req->cur_seg >= req->total_segs && |
827 | ++qp->s_cur == qp->s_size) |
828 | qp->s_cur = 0; |
829 | break; |
830 | |
831 | case IB_WR_ATOMIC_CMP_AND_SWP: |
832 | case IB_WR_ATOMIC_FETCH_AND_ADD: |
833 | /* |
834 | * Don't allow more operations to be started |
835 | * than the QP limits allow. |
836 | */ |
837 | if (qp->s_num_rd_atomic >= |
838 | qp->s_max_rd_atomic) { |
839 | qp->s_flags |= RVT_S_WAIT_RDMAR; |
840 | goto bail; |
841 | } |
842 | qp->s_num_rd_atomic++; |
843 | fallthrough; |
844 | case IB_WR_OPFN: |
845 | if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) |
846 | qp->s_lsn++; |
847 | if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || |
848 | wqe->wr.opcode == IB_WR_OPFN) { |
849 | qp->s_state = OP(COMPARE_SWAP); |
				put_ib_ateth_swap(wqe->atomic_wr.swap,
						  &ohdr->u.atomic_eth);
				put_ib_ateth_compare(wqe->atomic_wr.compare_add,
						     &ohdr->u.atomic_eth);
			} else {
				qp->s_state = OP(FETCH_ADD);
				put_ib_ateth_swap(wqe->atomic_wr.compare_add,
						  &ohdr->u.atomic_eth);
				put_ib_ateth_compare(0, &ohdr->u.atomic_eth);
			}
			put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
					   &ohdr->u.atomic_eth);
862 | ohdr->u.atomic_eth.rkey = cpu_to_be32( |
863 | wqe->atomic_wr.rkey); |
864 | hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); |
865 | ss = NULL; |
866 | len = 0; |
867 | bth2 |= IB_BTH_REQ_ACK; |
868 | if (++qp->s_cur == qp->s_size) |
869 | qp->s_cur = 0; |
870 | break; |
871 | |
872 | default: |
873 | goto bail; |
874 | } |
875 | if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) { |
876 | qp->s_sge.sge = wqe->sg_list[0]; |
877 | qp->s_sge.sg_list = wqe->sg_list + 1; |
878 | qp->s_sge.num_sge = wqe->wr.num_sge; |
879 | qp->s_sge.total_len = wqe->length; |
880 | qp->s_len = wqe->length; |
881 | } |
882 | if (newreq) { |
883 | qp->s_tail++; |
884 | if (qp->s_tail >= qp->s_size) |
885 | qp->s_tail = 0; |
886 | } |
887 | if (wqe->wr.opcode == IB_WR_RDMA_READ || |
888 | wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) |
889 | qp->s_psn = wqe->lpsn + 1; |
890 | else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) |
891 | qp->s_psn = req->s_next_psn; |
892 | else |
893 | qp->s_psn++; |
894 | break; |
895 | |
896 | case OP(RDMA_READ_RESPONSE_FIRST): |
897 | /* |
898 | * qp->s_state is normally set to the opcode of the |
899 | * last packet constructed for new requests and therefore |
900 | * is never set to RDMA read response. |
901 | * RDMA_READ_RESPONSE_FIRST is used by the ACK processing |
902 | * thread to indicate a SEND needs to be restarted from an |
903 | * earlier PSN without interfering with the sending thread. |
904 | * See restart_rc(). |
905 | */ |
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
907 | fallthrough; |
908 | case OP(SEND_FIRST): |
909 | qp->s_state = OP(SEND_MIDDLE); |
910 | fallthrough; |
911 | case OP(SEND_MIDDLE): |
		bth2 = mask_psn(qp->s_psn++);
913 | ss = &qp->s_sge; |
914 | len = qp->s_len; |
915 | if (len > pmtu) { |
916 | len = pmtu; |
917 | middle = HFI1_CAP_IS_KSET(SDMA_AHG); |
918 | break; |
919 | } |
920 | if (wqe->wr.opcode == IB_WR_SEND) { |
921 | qp->s_state = OP(SEND_LAST); |
922 | } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { |
923 | qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); |
924 | /* Immediate data comes after the BTH */ |
925 | ohdr->u.imm_data = wqe->wr.ex.imm_data; |
926 | hwords += 1; |
927 | } else { |
928 | qp->s_state = OP(SEND_LAST_WITH_INVALIDATE); |
929 | /* invalidate data comes after the BTH */ |
930 | ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey); |
931 | hwords += 1; |
932 | } |
933 | if (wqe->wr.send_flags & IB_SEND_SOLICITED) |
934 | bth0 |= IB_BTH_SOLICITED; |
935 | bth2 |= IB_BTH_REQ_ACK; |
936 | qp->s_cur++; |
937 | if (qp->s_cur >= qp->s_size) |
938 | qp->s_cur = 0; |
939 | break; |
940 | |
941 | case OP(RDMA_READ_RESPONSE_LAST): |
942 | /* |
943 | * qp->s_state is normally set to the opcode of the |
944 | * last packet constructed for new requests and therefore |
945 | * is never set to RDMA read response. |
946 | * RDMA_READ_RESPONSE_LAST is used by the ACK processing |
947 | * thread to indicate a RDMA write needs to be restarted from |
948 | * an earlier PSN without interfering with the sending thread. |
949 | * See restart_rc(). |
950 | */ |
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
952 | fallthrough; |
953 | case OP(RDMA_WRITE_FIRST): |
954 | qp->s_state = OP(RDMA_WRITE_MIDDLE); |
955 | fallthrough; |
956 | case OP(RDMA_WRITE_MIDDLE): |
		bth2 = mask_psn(qp->s_psn++);
958 | ss = &qp->s_sge; |
959 | len = qp->s_len; |
960 | if (len > pmtu) { |
961 | len = pmtu; |
962 | middle = HFI1_CAP_IS_KSET(SDMA_AHG); |
963 | break; |
964 | } |
965 | if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { |
966 | qp->s_state = OP(RDMA_WRITE_LAST); |
967 | } else { |
968 | qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); |
969 | /* Immediate data comes after the BTH */ |
970 | ohdr->u.imm_data = wqe->wr.ex.imm_data; |
971 | hwords += 1; |
972 | if (wqe->wr.send_flags & IB_SEND_SOLICITED) |
973 | bth0 |= IB_BTH_SOLICITED; |
974 | } |
975 | bth2 |= IB_BTH_REQ_ACK; |
976 | qp->s_cur++; |
977 | if (qp->s_cur >= qp->s_size) |
978 | qp->s_cur = 0; |
979 | break; |
980 | |
981 | case OP(RDMA_READ_RESPONSE_MIDDLE): |
982 | /* |
983 | * qp->s_state is normally set to the opcode of the |
984 | * last packet constructed for new requests and therefore |
985 | * is never set to RDMA read response. |
986 | * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing |
987 | * thread to indicate a RDMA read needs to be restarted from |
988 | * an earlier PSN without interfering with the sending thread. |
989 | * See restart_rc(). |
990 | */ |
		len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
		put_ib_reth_vaddr(
			wqe->rdma_wr.remote_addr + len,
			&ohdr->u.rc.reth);
995 | ohdr->u.rc.reth.rkey = |
996 | cpu_to_be32(wqe->rdma_wr.rkey); |
997 | ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len); |
998 | qp->s_state = OP(RDMA_READ_REQUEST); |
999 | hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); |
		bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
1001 | qp->s_psn = wqe->lpsn + 1; |
1002 | ss = NULL; |
1003 | len = 0; |
1004 | qp->s_cur++; |
1005 | if (qp->s_cur == qp->s_size) |
1006 | qp->s_cur = 0; |
1007 | break; |
1008 | |
1009 | case TID_OP(WRITE_RESP): |
1010 | /* |
1011 | * This value for s_state is used for restarting a TID RDMA |
		 * WRITE request. See the comment in OP(RDMA_READ_RESPONSE_MIDDLE)
		 * for more.
1014 | */ |
1015 | req = wqe_to_tid_req(wqe); |
1016 | req->state = TID_REQUEST_RESEND; |
1017 | rcu_read_lock(); |
1018 | remote = rcu_dereference(priv->tid_rdma.remote); |
		req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
1020 | len = wqe->length - (req->comp_seg * remote->max_len); |
1021 | rcu_read_unlock(); |
1022 | |
		bth2 = mask_psn(qp->s_psn);
		hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
							&bth2, &len);
1026 | qp->s_psn = wqe->lpsn + 1; |
1027 | ss = NULL; |
1028 | qp->s_state = TID_OP(WRITE_REQ); |
		priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
1030 | priv->s_tid_cur = qp->s_cur; |
1031 | if (++qp->s_cur == qp->s_size) |
1032 | qp->s_cur = 0; |
		trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode,
						  wqe->psn, wqe->lpsn, req);
1035 | break; |
1036 | |
1037 | case TID_OP(READ_RESP): |
1038 | if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) |
1039 | goto bail; |
1040 | /* This is used to restart a TID read request */ |
1041 | req = wqe_to_tid_req(wqe); |
1042 | wpriv = wqe->priv; |
1043 | /* |
1044 | * Back down. The field qp->s_psn has been set to the psn with |
		 * which the request should be restarted. It's OK to use division
1046 | * as this is on the retry path. |
1047 | */ |
		req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;
1049 | |
1050 | /* |
		 * The following function needs to be redefined to return the
1052 | * status to make sure that we find the flow. At the same |
1053 | * time, we can use the req->state change to check if the |
1054 | * call succeeds or not. |
1055 | */ |
1056 | req->state = TID_REQUEST_RESEND; |
		hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
1058 | if (req->state != TID_REQUEST_ACTIVE) { |
1059 | /* |
1060 | * Failed to find the flow. Release all allocated tid |
1061 | * resources. |
1062 | */ |
1063 | hfi1_kern_exp_rcv_clear_all(req); |
			hfi1_kern_clear_hw_flow(priv->rcd, qp);
1065 | |
			hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
1067 | goto bail; |
1068 | } |
1069 | req->state = TID_REQUEST_RESEND; |
1070 | len = min_t(u32, req->seg_len, |
1071 | wqe->length - req->seg_len * req->cur_seg); |
1072 | flow = &req->flows[req->flow_idx]; |
1073 | len -= flow->sent; |
1074 | req->s_next_psn = flow->flow_state.ib_lpsn + 1; |
		delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
							&bth2, &len);
1077 | if (delta <= 0) { |
1078 | /* Wait for TID space */ |
1079 | goto bail; |
1080 | } |
1081 | hwords += delta; |
1082 | ss = &wpriv->ss; |
1083 | /* Check if this is the last segment */ |
1084 | if (req->cur_seg >= req->total_segs && |
1085 | ++qp->s_cur == qp->s_size) |
1086 | qp->s_cur = 0; |
1087 | qp->s_psn = req->s_next_psn; |
		trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
						 wqe->psn, wqe->lpsn, req);
1090 | break; |
1091 | case TID_OP(READ_REQ): |
1092 | req = wqe_to_tid_req(wqe); |
		delta = cmp_psn(qp->s_psn, wqe->psn);
1094 | /* |
1095 | * If the current WR is not TID RDMA READ, or this is the start |
1096 | * of a new request, we need to change the qp->s_state so that |
1097 | * the request can be set up properly. |
1098 | */ |
1099 | if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 || |
1100 | qp->s_cur == qp->s_tail) { |
1101 | qp->s_state = OP(RDMA_READ_REQUEST); |
1102 | if (delta == 0 || qp->s_cur == qp->s_tail) |
1103 | goto check_s_state; |
1104 | else |
1105 | goto bail; |
1106 | } |
1107 | |
1108 | /* Rate limiting */ |
1109 | if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { |
1110 | qp->s_flags |= RVT_S_WAIT_RDMAR; |
1111 | goto bail; |
1112 | } |
1113 | |
1114 | wpriv = wqe->priv; |
1115 | /* Read one segment at a time */ |
1116 | len = min_t(u32, req->seg_len, |
1117 | wqe->length - req->seg_len * req->cur_seg); |
		delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
						     &bth2, &len);
1120 | if (delta <= 0) { |
1121 | /* Wait for TID space */ |
1122 | goto bail; |
1123 | } |
1124 | hwords += delta; |
1125 | ss = &wpriv->ss; |
1126 | /* Check if this is the last segment */ |
1127 | if (req->cur_seg >= req->total_segs && |
1128 | ++qp->s_cur == qp->s_size) |
1129 | qp->s_cur = 0; |
1130 | qp->s_psn = req->s_next_psn; |
		trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
						 wqe->psn, wqe->lpsn, req);
1133 | break; |
1134 | } |
1135 | qp->s_sending_hpsn = bth2; |
	delta = delta_psn(bth2, wqe->psn);
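	/*
	 * Ask the responder to acknowledge periodically (every
	 * HFI1_PSN_CREDIT packets) within a long request; TID RDMA WRITE
	 * has its own acknowledgment scheme and is excluded.
	 */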
1137 | if (delta && delta % HFI1_PSN_CREDIT == 0 && |
1138 | wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) |
1139 | bth2 |= IB_BTH_REQ_ACK; |
1140 | if (qp->s_flags & RVT_S_SEND_ONE) { |
1141 | qp->s_flags &= ~RVT_S_SEND_ONE; |
1142 | qp->s_flags |= RVT_S_WAIT_ACK; |
1143 | bth2 |= IB_BTH_REQ_ACK; |
1144 | } |
1145 | qp->s_len -= len; |
1146 | ps->s_txreq->hdr_dwords = hwords; |
1147 | ps->s_txreq->sde = priv->s_sde; |
1148 | ps->s_txreq->ss = ss; |
1149 | ps->s_txreq->s_cur_size = len; |
	hfi1_make_ruc_header(
		qp,
		ohdr,
		bth0 | (qp->s_state << 24),
		bth1,
		bth2,
		middle,
		ps);
1158 | return 1; |
1159 | |
1160 | done_free_tx: |
	hfi1_put_txreq(ps->s_txreq);
1162 | ps->s_txreq = NULL; |
1163 | return 1; |
1164 | |
1165 | bail: |
	hfi1_put_txreq(ps->s_txreq);
1167 | |
1168 | bail_no_tx: |
1169 | ps->s_txreq = NULL; |
1170 | qp->s_flags &= ~RVT_S_BUSY; |
1171 | /* |
1172 | * If we didn't get a txreq, the QP will be woken up later to try |
1173 | * again. Set the flags to indicate which work item to wake |
1174 | * up. |
1175 | */ |
	iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
1177 | return 0; |
1178 | } |
1179 | |
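/* Fill in the BTH and AETH of an RC ACK from the QP's receive-side state */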
1180 | static inline void hfi1_make_bth_aeth(struct rvt_qp *qp, |
1181 | struct ib_other_headers *ohdr, |
1182 | u32 bth0, u32 bth1) |
1183 | { |
1184 | if (qp->r_nak_state) |
1185 | ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | |
1186 | (qp->r_nak_state << |
1187 | IB_AETH_CREDIT_SHIFT)); |
1188 | else |
1189 | ohdr->u.aeth = rvt_compute_aeth(qp); |
1190 | |
1191 | ohdr->bth[0] = cpu_to_be32(bth0); |
1192 | ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn); |
1193 | ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn)); |
1194 | } |
1195 | |
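/*
 * hfi1_queue_rc_ack - defer an RC ACK/NAK to the send engine
 *
 * Mark the ACK as pending on the QP and schedule the send engine so that
 * make_rc_ack() builds and sends it when the QP is next serviced.
 */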
1196 | static inline void hfi1_queue_rc_ack(struct hfi1_packet *packet, bool is_fecn) |
1197 | { |
1198 | struct rvt_qp *qp = packet->qp; |
1199 | struct hfi1_ibport *ibp; |
1200 | unsigned long flags; |
1201 | |
1202 | spin_lock_irqsave(&qp->s_lock, flags); |
1203 | if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) |
1204 | goto unlock; |
	ibp = rcd_to_iport(packet->rcd);
1206 | this_cpu_inc(*ibp->rvp.rc_qacks); |
1207 | qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING; |
1208 | qp->s_nak_state = qp->r_nak_state; |
1209 | qp->s_ack_psn = qp->r_ack_psn; |
1210 | if (is_fecn) |
1211 | qp->s_flags |= RVT_S_ECN; |
1212 | |
1213 | /* Schedule the send tasklet. */ |
1214 | hfi1_schedule_send(qp); |
1215 | unlock: |
	spin_unlock_irqrestore(&qp->s_lock, flags);
1217 | } |
1218 | |
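/* Build the 9B (IB) header for an inline RC ACK */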
1219 | static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet, |
1220 | struct hfi1_opa_header *opa_hdr, |
1221 | u8 sc5, bool is_fecn, |
1222 | u64 *pbc_flags, u32 *hwords, |
1223 | u32 *nwords) |
1224 | { |
1225 | struct rvt_qp *qp = packet->qp; |
	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
1227 | struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); |
1228 | struct ib_header *hdr = &opa_hdr->ibh; |
1229 | struct ib_other_headers *ohdr; |
1230 | u16 lrh0 = HFI1_LRH_BTH; |
1231 | u16 pkey; |
1232 | u32 bth0, bth1; |
1233 | |
1234 | opa_hdr->hdr_type = HFI1_PKT_TYPE_9B; |
1235 | ohdr = &hdr->u.oth; |
1236 | /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */ |
1237 | *hwords = 6; |
1238 | |
1239 | if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) { |
		*hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
					 rdma_ah_read_grh(&qp->remote_ah_attr),
					 *hwords - 2, SIZE_OF_CRC);
1243 | ohdr = &hdr->u.l.oth; |
1244 | lrh0 = HFI1_LRH_GRH; |
1245 | } |
1246 | /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ |
1247 | *pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT); |
1248 | |
1249 | /* read pkey_index w/o lock (its atomic) */ |
	pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
1251 | |
1252 | lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT | |
		(rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) <<
1254 | IB_SL_SHIFT; |
1255 | |
	hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC,
			 opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B),
			 ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr));
1259 | |
1260 | bth0 = pkey | (OP(ACKNOWLEDGE) << 24); |
1261 | if (qp->s_mig_state == IB_MIG_MIGRATED) |
1262 | bth0 |= IB_BTH_MIG_REQ; |
1263 | bth1 = (!!is_fecn) << IB_BECN_SHIFT; |
1264 | /* |
1265 | * Inline ACKs go out without the use of the Verbs send engine, so |
1266 | * we need to set the STL Verbs Extended bit here |
1267 | */ |
1268 | bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT; |
1269 | hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); |
1270 | } |
1271 | |
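/* Build the 16B (OPA) header for an inline RC ACK */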
1272 | static inline void hfi1_make_rc_ack_16B(struct hfi1_packet *packet, |
1273 | struct hfi1_opa_header *opa_hdr, |
1274 | u8 sc5, bool is_fecn, |
1275 | u64 *pbc_flags, u32 *hwords, |
1276 | u32 *nwords) |
1277 | { |
1278 | struct rvt_qp *qp = packet->qp; |
	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
1280 | struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); |
1281 | struct hfi1_16b_header *hdr = &opa_hdr->opah; |
1282 | struct ib_other_headers *ohdr; |
1283 | u32 bth0, bth1 = 0; |
1284 | u16 len, pkey; |
1285 | bool becn = is_fecn; |
1286 | u8 l4 = OPA_16B_L4_IB_LOCAL; |
	u8 extra_bytes;
1288 | |
1289 | opa_hdr->hdr_type = HFI1_PKT_TYPE_16B; |
1290 | ohdr = &hdr->u.oth; |
1291 | /* header size in 32-bit words 16B LRH+BTH+AETH = (16+12+4)/4 */ |
1292 | *hwords = 8; |
	extra_bytes = hfi1_get_16b_padding(*hwords << 2, 0);
1294 | *nwords = SIZE_OF_CRC + ((extra_bytes + SIZE_OF_LT) >> 2); |
1295 | |
1296 | if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && |
	    hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) {
		*hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
					 rdma_ah_read_grh(&qp->remote_ah_attr),
					 *hwords - 4, *nwords);
1301 | ohdr = &hdr->u.l.oth; |
1302 | l4 = OPA_16B_L4_IB_GLOBAL; |
1303 | } |
1304 | *pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; |
1305 | |
1306 | /* read pkey_index w/o lock (its atomic) */ |
	pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
1308 | |
1309 | /* Convert dwords to flits */ |
1310 | len = (*hwords + *nwords) >> 1; |
1311 | |
	hfi1_make_16b_hdr(hdr, ppd->lid |
			  (rdma_ah_get_path_bits(&qp->remote_ah_attr) &
			   ((1 << ppd->lmc) - 1)),
			  opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr),
				      16B), len, pkey, becn, 0, l4, sc5);
1317 | |
1318 | bth0 = pkey | (OP(ACKNOWLEDGE) << 24); |
1319 | bth0 |= extra_bytes << 20; |
1320 | if (qp->s_mig_state == IB_MIG_MIGRATED) |
1321 | bth1 = OPA_BTH_MIG_REQ; |
1322 | hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); |
1323 | } |
1324 | |
1325 | typedef void (*hfi1_make_rc_ack)(struct hfi1_packet *packet, |
1326 | struct hfi1_opa_header *opa_hdr, |
1327 | u8 sc5, bool is_fecn, |
1328 | u64 *pbc_flags, u32 *hwords, |
1329 | u32 *nwords); |
1330 | |
1331 | /* We support only two types - 9B and 16B for now */ |
1332 | static const hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = { |
1333 | [HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B, |
1334 | [HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B |
1335 | }; |
1336 | |
1337 | /* |
1338 | * hfi1_send_rc_ack - Construct an ACK packet and send it |
1339 | * |
1340 | * This is called from hfi1_rc_rcv() and handle_receive_interrupt(). |
1341 | * Note that RDMA reads and atomics are handled in the |
1342 | * send side QP state and send engine. |
1343 | */ |
1344 | void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn) |
1345 | { |
1346 | struct hfi1_ctxtdata *rcd = packet->rcd; |
1347 | struct rvt_qp *qp = packet->qp; |
1348 | struct hfi1_ibport *ibp = rcd_to_iport(rcd); |
1349 | struct hfi1_qp_priv *priv = qp->priv; |
1350 | struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); |
	u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)];
1352 | u64 pbc, pbc_flags = 0; |
1353 | u32 hwords = 0; |
1354 | u32 nwords = 0; |
1355 | u32 plen; |
1356 | struct pio_buf *pbuf; |
1357 | struct hfi1_opa_header opa_hdr; |
1358 | |
1359 | /* clear the defer count */ |
1360 | qp->r_adefered = 0; |
1361 | |
1362 | /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ |
1363 | if (qp->s_flags & RVT_S_RESP_PENDING) { |
1364 | hfi1_queue_rc_ack(packet, is_fecn); |
1365 | return; |
1366 | } |
1367 | |
1368 | /* Ensure s_rdma_ack_cnt changes are committed */ |
1369 | if (qp->s_rdma_ack_cnt) { |
1370 | hfi1_queue_rc_ack(packet, is_fecn); |
1371 | return; |
1372 | } |
1373 | |
1374 | /* Don't try to send ACKs if the link isn't ACTIVE */ |
1375 | if (driver_lstate(ppd) != IB_PORT_ACTIVE) |
1376 | return; |
1377 | |
1378 | /* Make the appropriate header */ |
1379 | hfi1_make_rc_ack_tbl[priv->hdr_type](packet, &opa_hdr, sc5, is_fecn, |
1380 | &pbc_flags, &hwords, &nwords); |
1381 | |
1382 | plen = 2 /* PBC */ + hwords + nwords; |
	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps,
			 sc_to_vlt(ppd->dd, sc5), plen);
	pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL);
	if (IS_ERR_OR_NULL(pbuf)) {
1387 | /* |
1388 | * We have no room to send at the moment. Pass |
1389 | * responsibility for sending the ACK to the send engine |
1390 | * so that when enough buffer space becomes available, |
1391 | * the ACK is sent ahead of other outgoing packets. |
1392 | */ |
1393 | hfi1_queue_rc_ack(packet, is_fecn); |
1394 | return; |
1395 | } |
	trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
			       &opa_hdr, ib_is_sc5(sc5));
1398 | |
1399 | /* write the pbc and data */ |
1400 | ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, |
1401 | (priv->hdr_type == HFI1_PKT_TYPE_9B ? |
1402 | (void *)&opa_hdr.ibh : |
1403 | (void *)&opa_hdr.opah), hwords); |
1404 | return; |
1405 | } |
1406 | |
1407 | /** |
1408 | * update_num_rd_atomic - update the qp->s_num_rd_atomic |
1409 | * @qp: the QP |
1410 | * @psn: the packet sequence number to restart at |
1411 | * @wqe: the wqe |
1412 | * |
1413 | * This is called from reset_psn() to update qp->s_num_rd_atomic |
1414 | * for the current wqe. |
1415 | * Called at interrupt level with the QP s_lock held. |
1416 | */ |
1417 | static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn, |
1418 | struct rvt_swqe *wqe) |
1419 | { |
1420 | u32 opcode = wqe->wr.opcode; |
1421 | |
1422 | if (opcode == IB_WR_RDMA_READ || |
1423 | opcode == IB_WR_ATOMIC_CMP_AND_SWP || |
1424 | opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { |
1425 | qp->s_num_rd_atomic++; |
1426 | } else if (opcode == IB_WR_TID_RDMA_READ) { |
1427 | struct tid_rdma_request *req = wqe_to_tid_req(wqe); |
1428 | struct hfi1_qp_priv *priv = qp->priv; |
1429 | |
		if (cmp_psn(psn, wqe->lpsn) <= 0) {
1431 | u32 cur_seg; |
1432 | |
1433 | cur_seg = (psn - wqe->psn) / priv->pkts_ps; |
1434 | req->ack_pending = cur_seg - req->comp_seg; |
1435 | priv->pending_tid_r_segs += req->ack_pending; |
1436 | qp->s_num_rd_atomic += req->ack_pending; |
			trace_hfi1_tid_req_update_num_rd_atomic(qp, 0,
								wqe->wr.opcode,
								wqe->psn,
								wqe->lpsn,
								req);
1442 | } else { |
1443 | priv->pending_tid_r_segs += req->total_segs; |
1444 | qp->s_num_rd_atomic += req->total_segs; |
1445 | } |
1446 | } |
1447 | } |
1448 | |
1449 | /** |
1450 | * reset_psn - reset the QP state to send starting from PSN |
1451 | * @qp: the QP |
1452 | * @psn: the packet sequence number to restart at |
1453 | * |
1454 | * This is called from hfi1_rc_rcv() to process an incoming RC ACK |
1455 | * for the given QP. |
1456 | * Called at interrupt level with the QP s_lock held. |
1457 | */ |
1458 | static void reset_psn(struct rvt_qp *qp, u32 psn) |
1459 | { |
1460 | u32 n = qp->s_acked; |
1461 | struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n); |
1462 | u32 opcode; |
1463 | struct hfi1_qp_priv *priv = qp->priv; |
1464 | |
1465 | lockdep_assert_held(&qp->s_lock); |
1466 | qp->s_cur = n; |
1467 | priv->pending_tid_r_segs = 0; |
1468 | priv->pending_tid_w_resp = 0; |
1469 | qp->s_num_rd_atomic = 0; |
1470 | |
1471 | /* |
1472 | * If we are starting the request from the beginning, |
1473 | * let the normal send code handle initialization. |
1474 | */ |
	if (cmp_psn(psn, wqe->psn) <= 0) {
1476 | qp->s_state = OP(SEND_LAST); |
1477 | goto done; |
1478 | } |
1479 | update_num_rd_atomic(qp, psn, wqe); |
1480 | |
1481 | /* Find the work request opcode corresponding to the given PSN. */ |
1482 | for (;;) { |
1483 | int diff; |
1484 | |
1485 | if (++n == qp->s_size) |
1486 | n = 0; |
1487 | if (n == qp->s_tail) |
1488 | break; |
1489 | wqe = rvt_get_swqe_ptr(qp, n); |
		diff = cmp_psn(psn, wqe->psn);
1491 | if (diff < 0) { |
1492 | /* Point wqe back to the previous one*/ |
			wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
1494 | break; |
1495 | } |
1496 | qp->s_cur = n; |
1497 | /* |
1498 | * If we are starting the request from the beginning, |
1499 | * let the normal send code handle initialization. |
1500 | */ |
1501 | if (diff == 0) { |
1502 | qp->s_state = OP(SEND_LAST); |
1503 | goto done; |
1504 | } |
1505 | |
1506 | update_num_rd_atomic(qp, psn, wqe); |
1507 | } |
1508 | opcode = wqe->wr.opcode; |
1509 | |
1510 | /* |
1511 | * Set the state to restart in the middle of a request. |
1512 | * Don't change the s_sge, s_cur_sge, or s_cur_size. |
1513 | * See hfi1_make_rc_req(). |
1514 | */ |
1515 | switch (opcode) { |
1516 | case IB_WR_SEND: |
1517 | case IB_WR_SEND_WITH_IMM: |
1518 | qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); |
1519 | break; |
1520 | |
1521 | case IB_WR_RDMA_WRITE: |
1522 | case IB_WR_RDMA_WRITE_WITH_IMM: |
1523 | qp->s_state = OP(RDMA_READ_RESPONSE_LAST); |
1524 | break; |
1525 | |
1526 | case IB_WR_TID_RDMA_WRITE: |
1527 | qp->s_state = TID_OP(WRITE_RESP); |
1528 | break; |
1529 | |
1530 | case IB_WR_RDMA_READ: |
1531 | qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); |
1532 | break; |
1533 | |
1534 | case IB_WR_TID_RDMA_READ: |
1535 | qp->s_state = TID_OP(READ_RESP); |
1536 | break; |
1537 | |
1538 | default: |
1539 | /* |
1540 | * This case shouldn't happen since its only |
1541 | * one PSN per req. |
1542 | */ |
1543 | qp->s_state = OP(SEND_LAST); |
1544 | } |
1545 | done: |
1546 | priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; |
1547 | qp->s_psn = psn; |
1548 | /* |
1549 | * Set RVT_S_WAIT_PSN as rc_complete() may start the timer |
1550 | * asynchronously before the send engine can get scheduled. |
1551 | * Doing it in hfi1_make_rc_req() is too late. |
1552 | */ |
	if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
	    (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
1555 | qp->s_flags |= RVT_S_WAIT_PSN; |
1556 | qp->s_flags &= ~HFI1_S_AHG_VALID; |
1557 | trace_hfi1_sender_reset_psn(qp); |
1558 | } |
1559 | |
1560 | /* |
1561 | * Back up requester to resend the last un-ACKed request. |
1562 | * The QP r_lock and s_lock should be held and interrupts disabled. |
1563 | */ |
1564 | void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) |
1565 | { |
1566 | struct hfi1_qp_priv *priv = qp->priv; |
	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1568 | struct hfi1_ibport *ibp; |
1569 | |
1570 | lockdep_assert_held(&qp->r_lock); |
1571 | lockdep_assert_held(&qp->s_lock); |
1572 | trace_hfi1_sender_restart_rc(qp); |
1573 | if (qp->s_retry == 0) { |
1574 | if (qp->s_mig_state == IB_MIG_ARMED) { |
1575 | hfi1_migrate_qp(qp); |
1576 | qp->s_retry = qp->s_retry_cnt; |
1577 | } else if (qp->s_last == qp->s_acked) { |
1578 | /* |
1579 | * We need special handling for the OPFN request WQEs as |
1580 | * they are not allowed to generate real user errors |
1581 | */ |
1582 | if (wqe->wr.opcode == IB_WR_OPFN) { |
1583 | struct hfi1_ibport *ibp = |
					to_iport(qp->ibqp.device, qp->port_num);
1585 | /* |
1586 | * Call opfn_conn_reply() with capcode and |
1587 | * remaining data as 0 to close out the |
1588 | * current request |
1589 | */ |
				opfn_conn_reply(qp, priv->opfn.curr);
1591 | wqe = do_rc_completion(qp, wqe, ibp); |
1592 | qp->s_flags &= ~RVT_S_WAIT_ACK; |
1593 | } else { |
				trace_hfi1_tid_write_sender_restart_rc(qp, 0);
1595 | if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { |
1596 | struct tid_rdma_request *req; |
1597 | |
1598 | req = wqe_to_tid_req(wqe); |
1599 | hfi1_kern_exp_rcv_clear_all(req); |
					hfi1_kern_clear_hw_flow(priv->rcd, qp);
1601 | } |
1602 | |
				hfi1_trdma_send_complete(qp, wqe,
							 IB_WC_RETRY_EXC_ERR);
				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1606 | } |
1607 | return; |
1608 | } else { /* need to handle delayed completion */ |
1609 | return; |
1610 | } |
1611 | } else { |
1612 | qp->s_retry--; |
1613 | } |
1614 | |
	ibp = to_iport(qp->ibqp.device, qp->port_num);
1616 | if (wqe->wr.opcode == IB_WR_RDMA_READ || |
1617 | wqe->wr.opcode == IB_WR_TID_RDMA_READ) |
1618 | ibp->rvp.n_rc_resends++; |
1619 | else |
		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
1621 | |
1622 | qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | |
1623 | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | |
1624 | RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP); |
1625 | if (wait) |
1626 | qp->s_flags |= RVT_S_SEND_ONE; |
1627 | reset_psn(qp, psn); |
1628 | } |
1629 | |
1630 | /* |
1631 | * Set qp->s_sending_psn to the next PSN after the given one. |
1632 | * This would be psn+1 except when RDMA reads or TID RDMA ops |
1633 | * are present. |
1634 | */ |
1635 | static void reset_sending_psn(struct rvt_qp *qp, u32 psn) |
1636 | { |
1637 | struct rvt_swqe *wqe; |
1638 | u32 n = qp->s_last; |
1639 | |
1640 | lockdep_assert_held(&qp->s_lock); |
1641 | /* Find the work request corresponding to the given PSN. */ |
1642 | for (;;) { |
1643 | wqe = rvt_get_swqe_ptr(qp, n); |
1644 | if (cmp_psn(a: psn, b: wqe->lpsn) <= 0) { |
1645 | if (wqe->wr.opcode == IB_WR_RDMA_READ || |
1646 | wqe->wr.opcode == IB_WR_TID_RDMA_READ || |
1647 | wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) |
1648 | qp->s_sending_psn = wqe->lpsn + 1; |
1649 | else |
1650 | qp->s_sending_psn = psn + 1; |
1651 | break; |
1652 | } |
1653 | if (++n == qp->s_size) |
1654 | n = 0; |
1655 | if (n == qp->s_tail) |
1656 | break; |
1657 | } |
1658 | } |
1659 | |
1660 | /** |
1661 | * hfi1_rc_verbs_aborted - handle abort status |
1662 | * @qp: the QP |
1663 | * @opah: the opa header |
1664 | * |
 * This code modifies both the ACK bit in BTH[2]
 * and the s_flags to go into send-one mode.
 *
 * This serves to throttle the send engine to only
 * send a single packet in the likely case that
 * a link has gone down.
1671 | */ |
1672 | void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah) |
1673 | { |
1674 | struct ib_other_headers *ohdr = hfi1_get_rc_ohdr(opah); |
1675 | u8 opcode = ib_bth_get_opcode(ohdr); |
1676 | u32 psn; |
1677 | |
1678 | /* ignore responses */ |
1679 | if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && |
1680 | opcode <= OP(ATOMIC_ACKNOWLEDGE)) || |
1681 | opcode == TID_OP(READ_RESP) || |
1682 | opcode == TID_OP(WRITE_RESP)) |
1683 | return; |
1684 | |
1685 | psn = ib_bth_get_psn(ohdr) | IB_BTH_REQ_ACK; |
1686 | ohdr->bth[2] = cpu_to_be32(psn); |
1687 | qp->s_flags |= RVT_S_SEND_ONE; |
1688 | } |
1689 | |
1690 | /* |
1691 | * This should be called with the QP s_lock held and interrupts disabled. |
1692 | */ |
1693 | void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) |
1694 | { |
1695 | struct ib_other_headers *ohdr; |
1696 | struct hfi1_qp_priv *priv = qp->priv; |
1697 | struct rvt_swqe *wqe; |
1698 | u32 opcode, head, tail; |
1699 | u32 psn; |
1700 | struct tid_rdma_request *req; |
1701 | |
1702 | lockdep_assert_held(&qp->s_lock); |
1703 | if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK)) |
1704 | return; |
1705 | |
1706 | ohdr = hfi1_get_rc_ohdr(opah); |
1707 | opcode = ib_bth_get_opcode(ohdr); |
1708 | if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && |
1709 | opcode <= OP(ATOMIC_ACKNOWLEDGE)) || |
1710 | opcode == TID_OP(READ_RESP) || |
1711 | opcode == TID_OP(WRITE_RESP)) { |
1712 | WARN_ON(!qp->s_rdma_ack_cnt); |
1713 | qp->s_rdma_ack_cnt--; |
1714 | return; |
1715 | } |
1716 | |
1717 | psn = ib_bth_get_psn(ohdr); |
1718 | /* |
1719 | * Don't attempt to reset the sending PSN for packets in the |
1720 | * KDETH PSN space since the PSN does not match anything. |
1721 | */ |
1722 | if (opcode != TID_OP(WRITE_DATA) && |
1723 | opcode != TID_OP(WRITE_DATA_LAST) && |
1724 | opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC)) |
1725 | reset_sending_psn(qp, psn); |
1726 | |
1727 | /* Handle TID RDMA WRITE packets differently */ |
1728 | if (opcode >= TID_OP(WRITE_REQ) && |
1729 | opcode <= TID_OP(WRITE_DATA_LAST)) { |
1730 | head = priv->s_tid_head; |
1731 | tail = priv->s_tid_cur; |
1732 | /* |
		 * s_tid_cur is set to s_tid_head in the case where
1734 | * a new TID RDMA request is being started and all |
1735 | * previous ones have been completed. |
1736 | * Therefore, we need to do a secondary check in order |
1737 | * to properly determine whether we should start the |
1738 | * RC timer. |
1739 | */ |
1740 | wqe = rvt_get_swqe_ptr(qp, n: tail); |
1741 | req = wqe_to_tid_req(wqe); |
1742 | if (head == tail && req->comp_seg < req->total_segs) { |
1743 | if (tail == 0) |
1744 | tail = qp->s_size - 1; |
1745 | else |
1746 | tail -= 1; |
1747 | } |
1748 | } else { |
1749 | head = qp->s_tail; |
1750 | tail = qp->s_acked; |
1751 | } |
1752 | |
1753 | /* |
1754 | * Start timer after a packet requesting an ACK has been sent and |
1755 | * there are still requests that haven't been acked. |
1756 | */ |
1757 | if ((psn & IB_BTH_REQ_ACK) && tail != head && |
1758 | opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) && |
1759 | opcode != TID_OP(RESYNC) && |
1760 | !(qp->s_flags & |
1761 | (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && |
1762 | (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { |
1763 | if (opcode == TID_OP(READ_REQ)) |
1764 | rvt_add_retry_timer_ext(qp, shift: priv->timeout_shift); |
1765 | else |
1766 | rvt_add_retry_timer(qp); |
1767 | } |
1768 | |
1769 | /* Start TID RDMA ACK timer */ |
1770 | if ((opcode == TID_OP(WRITE_DATA) || |
1771 | opcode == TID_OP(WRITE_DATA_LAST) || |
1772 | opcode == TID_OP(RESYNC)) && |
1773 | (psn & IB_BTH_REQ_ACK) && |
1774 | !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) && |
1775 | (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { |
1776 | /* |
1777 | * The TID RDMA ACK packet could be received before this |
1778 | * function is called. Therefore, add the timer only if TID |
1779 | * RDMA ACK packets are actually pending. |
1780 | */ |
1781 | wqe = rvt_get_swqe_ptr(qp, n: qp->s_acked); |
1782 | req = wqe_to_tid_req(wqe); |
1783 | if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && |
1784 | req->ack_seg < req->cur_seg) |
1785 | hfi1_add_tid_retry_timer(qp); |
1786 | } |
1787 | |
1788 | while (qp->s_last != qp->s_acked) { |
1789 | wqe = rvt_get_swqe_ptr(qp, n: qp->s_last); |
1790 | if (cmp_psn(a: wqe->lpsn, b: qp->s_sending_psn) >= 0 && |
1791 | cmp_psn(a: qp->s_sending_psn, b: qp->s_sending_hpsn) <= 0) |
1792 | break; |
1793 | trdma_clean_swqe(qp, wqe); |
1794 | trace_hfi1_qp_send_completion(qp, wqe, idx: qp->s_last); |
1795 | rvt_qp_complete_swqe(qp, |
1796 | wqe, |
1797 | opcode: ib_hfi1_wc_opcode[wqe->wr.opcode], |
1798 | status: IB_WC_SUCCESS); |
1799 | } |
1800 | /* |
1801 | * If we were waiting for sends to complete before re-sending, |
1802 | * and they are now complete, restart sending. |
1803 | */ |
1804 | trace_hfi1_sendcomplete(qp, psn); |
1805 | if (qp->s_flags & RVT_S_WAIT_PSN && |
1806 | cmp_psn(a: qp->s_sending_psn, b: qp->s_sending_hpsn) > 0) { |
1807 | qp->s_flags &= ~RVT_S_WAIT_PSN; |
1808 | qp->s_sending_psn = qp->s_psn; |
1809 | qp->s_sending_hpsn = qp->s_psn - 1; |
1810 | hfi1_schedule_send(qp); |
1811 | } |
1812 | } |
1813 | |
1814 | static inline void update_last_psn(struct rvt_qp *qp, u32 psn) |
1815 | { |
1816 | qp->s_last_psn = psn; |
1817 | } |
1818 | |
1819 | /* |
1820 | * Generate a SWQE completion. |
1821 | * This is similar to hfi1_send_complete but has to check to be sure |
1822 | * that the SGEs are not being referenced if the SWQE is being resent. |
1823 | */ |
1824 | struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, |
1825 | struct rvt_swqe *wqe, |
1826 | struct hfi1_ibport *ibp) |
1827 | { |
1828 | struct hfi1_qp_priv *priv = qp->priv; |
1829 | |
1830 | lockdep_assert_held(&qp->s_lock); |
1831 | /* |
1832 | * Don't decrement refcount and don't generate a |
1833 | * completion if the SWQE is being resent until the send |
1834 | * is finished. |
1835 | */ |
1836 | trace_hfi1_rc_completion(qp, psn: wqe->lpsn); |
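	/*
	 * The WQE can be completed now only if its last packet has already
	 * left the send engine (lpsn below s_sending_psn) or nothing is
	 * currently being handed to the engine (s_sending_psn has passed
	 * s_sending_hpsn); otherwise the completion is delayed.
	 */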
1837 | if (cmp_psn(a: wqe->lpsn, b: qp->s_sending_psn) < 0 || |
1838 | cmp_psn(a: qp->s_sending_psn, b: qp->s_sending_hpsn) > 0) { |
1839 | trdma_clean_swqe(qp, wqe); |
1840 | trace_hfi1_qp_send_completion(qp, wqe, idx: qp->s_last); |
1841 | rvt_qp_complete_swqe(qp, |
1842 | wqe, |
1843 | opcode: ib_hfi1_wc_opcode[wqe->wr.opcode], |
1844 | status: IB_WC_SUCCESS); |
1845 | } else { |
1846 | struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); |
1847 | |
1848 | this_cpu_inc(*ibp->rvp.rc_delayed_comp); |
1849 | /* |
		 * If send progress is not running, attempt to progress
		 * the SDMA queue.
1852 | */ |
1853 | if (ppd->dd->flags & HFI1_HAS_SEND_DMA) { |
1854 | struct sdma_engine *engine; |
1855 | u8 sl = rdma_ah_get_sl(attr: &qp->remote_ah_attr); |
1856 | u8 sc5; |
1857 | |
1858 | /* For now use sc to find engine */ |
1859 | sc5 = ibp->sl_to_sc[sl]; |
1860 | engine = qp_to_sdma_engine(qp, sc5); |
1861 | sdma_engine_progress_schedule(sde: engine); |
1862 | } |
1863 | } |
1864 | |
1865 | qp->s_retry = qp->s_retry_cnt; |
1866 | /* |
1867 | * Don't update the last PSN if the request being completed is |
1868 | * a TID RDMA WRITE request. |
1869 | * Completion of the TID RDMA WRITE requests are done by the |
1870 | * TID RDMA ACKs and as such could be for a request that has |
1871 | * already been ACKed as far as the IB state machine is |
1872 | * concerned. |
1873 | */ |
1874 | if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) |
1875 | update_last_psn(qp, psn: wqe->lpsn); |
1876 | |
1877 | /* |
1878 | * If we are completing a request which is in the process of |
1879 | * being resent, we can stop re-sending it since we know the |
1880 | * responder has already seen it. |
1881 | */ |
1882 | if (qp->s_acked == qp->s_cur) { |
1883 | if (++qp->s_cur >= qp->s_size) |
1884 | qp->s_cur = 0; |
1885 | qp->s_acked = qp->s_cur; |
1886 | wqe = rvt_get_swqe_ptr(qp, n: qp->s_cur); |
1887 | if (qp->s_acked != qp->s_tail) { |
1888 | qp->s_state = OP(SEND_LAST); |
1889 | qp->s_psn = wqe->psn; |
1890 | } |
1891 | } else { |
1892 | if (++qp->s_acked >= qp->s_size) |
1893 | qp->s_acked = 0; |
1894 | if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur) |
1895 | qp->s_draining = 0; |
1896 | wqe = rvt_get_swqe_ptr(qp, n: qp->s_acked); |
1897 | } |
1898 | if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) { |
1899 | priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; |
1900 | hfi1_schedule_send(qp); |
1901 | } |
1902 | return wqe; |
1903 | } |
1904 | |
1905 | static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd) |
1906 | { |
1907 | /* Retry this request. */ |
1908 | if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { |
1909 | qp->r_flags |= RVT_R_RDMAR_SEQ; |
1910 | hfi1_restart_rc(qp, psn: qp->s_last_psn + 1, wait: 0); |
1911 | if (list_empty(head: &qp->rspwait)) { |
1912 | qp->r_flags |= RVT_R_RSP_SEND; |
1913 | rvt_get_qp(qp); |
1914 | list_add_tail(new: &qp->rspwait, head: &rcd->qp_wait_list); |
1915 | } |
1916 | } |
1917 | } |
1918 | |
1919 | /** |
1920 | * update_qp_retry_state - Update qp retry state. |
1921 | * @qp: the QP |
1922 | * @psn: the packet sequence number of the TID RDMA WRITE RESP. |
1923 | * @spsn: The start psn for the given TID RDMA WRITE swqe. |
1924 | * @lpsn: The last psn for the given TID RDMA WRITE swqe. |
1925 | * |
1926 | * This function is called to update the qp retry state upon |
1927 | * receiving a TID WRITE RESP after the qp is scheduled to retry |
1928 | * a request. |
1929 | */ |
1930 | static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn, |
1931 | u32 lpsn) |
1932 | { |
1933 | struct hfi1_qp_priv *qpriv = qp->priv; |
1934 | |
1935 | qp->s_psn = psn + 1; |
1936 | /* |
1937 | * If this is the first TID RDMA WRITE RESP packet for the current |
1938 | * request, change the s_state so that the retry will be processed |
1939 | * correctly. Similarly, if this is the last TID RDMA WRITE RESP |
1940 | * packet, change the s_state and advance the s_cur. |
1941 | */ |
1942 | if (cmp_psn(a: psn, b: lpsn) >= 0) { |
1943 | qp->s_cur = qpriv->s_tid_cur + 1; |
1944 | if (qp->s_cur >= qp->s_size) |
1945 | qp->s_cur = 0; |
1946 | qp->s_state = TID_OP(WRITE_REQ); |
1947 | } else if (!cmp_psn(a: psn, b: spsn)) { |
1948 | qp->s_cur = qpriv->s_tid_cur; |
1949 | qp->s_state = TID_OP(WRITE_RESP); |
1950 | } |
1951 | } |
1952 | |
1953 | /* |
1954 | * do_rc_ack - process an incoming RC ACK |
 * @qp: the QP the ACK came in on
 * @aeth: the ACK extended transport header
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the request that resulted in the ACK
 * @val: the data returned in an atomic response, if any
 * @rcd: the receive context on which the packet arrived
1958 | * |
1959 | * This is called from rc_rcv_resp() to process an incoming RC ACK |
1960 | * for the given QP. |
1961 | * May be called at interrupt level, with the QP s_lock held. |
1962 | * Returns 1 if OK, 0 if current operation should be aborted (NAK). |
1963 | */ |
1964 | int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, |
1965 | u64 val, struct hfi1_ctxtdata *rcd) |
1966 | { |
1967 | struct hfi1_ibport *ibp; |
1968 | enum ib_wc_status status; |
1969 | struct hfi1_qp_priv *qpriv = qp->priv; |
1970 | struct rvt_swqe *wqe; |
1971 | int ret = 0; |
1972 | u32 ack_psn; |
1973 | int diff; |
1974 | struct rvt_dev_info *rdi; |
1975 | |
1976 | lockdep_assert_held(&qp->s_lock); |
1977 | /* |
1978 | * Note that NAKs implicitly ACK outstanding SEND and RDMA write |
1979 | * requests and implicitly NAK RDMA read and atomic requests issued |
1980 | * before the NAK'ed request. The MSN won't include the NAK'ed |
1981 | * request but will include an ACK'ed request(s). |
1982 | */ |
1983 | ack_psn = psn; |
1984 | if (aeth >> IB_AETH_NAK_SHIFT) |
1985 | ack_psn--; |
1986 | wqe = rvt_get_swqe_ptr(qp, n: qp->s_acked); |
1987 | ibp = rcd_to_iport(rcd); |
1988 | |
1989 | /* |
1990 | * The MSN might be for a later WQE than the PSN indicates so |
1991 | * only complete WQEs that the PSN finishes. |
1992 | */ |
1993 | while ((diff = delta_psn(a: ack_psn, b: wqe->lpsn)) >= 0) { |
1994 | /* |
1995 | * RDMA_READ_RESPONSE_ONLY is a special case since |
1996 | * we want to generate completion events for everything |
1997 | * before the RDMA read, copy the data, then generate |
1998 | * the completion for the read. |
1999 | */ |
2000 | if (wqe->wr.opcode == IB_WR_RDMA_READ && |
2001 | opcode == OP(RDMA_READ_RESPONSE_ONLY) && |
2002 | diff == 0) { |
2003 | ret = 1; |
2004 | goto bail_stop; |
2005 | } |
2006 | /* |
2007 | * If this request is a RDMA read or atomic, and the ACK is |
2008 | * for a later operation, this ACK NAKs the RDMA read or |
2009 | * atomic. In other words, only a RDMA_READ_LAST or ONLY |
2010 | * can ACK a RDMA read and likewise for atomic ops. Note |
2011 | * that the NAK case can only happen if relaxed ordering is |
2012 | * used and requests are sent after an RDMA read or atomic |
2013 | * is sent but before the response is received. |
2014 | */ |
2015 | if ((wqe->wr.opcode == IB_WR_RDMA_READ && |
2016 | (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || |
2017 | (wqe->wr.opcode == IB_WR_TID_RDMA_READ && |
2018 | (opcode != TID_OP(READ_RESP) || diff != 0)) || |
2019 | ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || |
2020 | wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && |
2021 | (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) || |
2022 | (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && |
2023 | (delta_psn(a: psn, b: qp->s_last_psn) != 1))) { |
2024 | set_restart_qp(qp, rcd); |
2025 | /* |
2026 | * No need to process the ACK/NAK since we are |
2027 | * restarting an earlier request. |
2028 | */ |
2029 | goto bail_stop; |
2030 | } |
2031 | if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || |
2032 | wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { |
2033 | u64 *vaddr = wqe->sg_list[0].vaddr; |
2034 | *vaddr = val; |
2035 | } |
2036 | if (wqe->wr.opcode == IB_WR_OPFN) |
2037 | opfn_conn_reply(qp, data: val); |
2038 | |
2039 | if (qp->s_num_rd_atomic && |
2040 | (wqe->wr.opcode == IB_WR_RDMA_READ || |
2041 | wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || |
2042 | wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { |
2043 | qp->s_num_rd_atomic--; |
2044 | /* Restart sending task if fence is complete */ |
2045 | if ((qp->s_flags & RVT_S_WAIT_FENCE) && |
2046 | !qp->s_num_rd_atomic) { |
2047 | qp->s_flags &= ~(RVT_S_WAIT_FENCE | |
2048 | RVT_S_WAIT_ACK); |
2049 | hfi1_schedule_send(qp); |
2050 | } else if (qp->s_flags & RVT_S_WAIT_RDMAR) { |
2051 | qp->s_flags &= ~(RVT_S_WAIT_RDMAR | |
2052 | RVT_S_WAIT_ACK); |
2053 | hfi1_schedule_send(qp); |
2054 | } |
2055 | } |
2056 | |
2057 | /* |
2058 | * TID RDMA WRITE requests will be completed by the TID RDMA |
2059 | * ACK packet handler (see tid_rdma.c). |
2060 | */ |
2061 | if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) |
2062 | break; |
2063 | |
2064 | wqe = do_rc_completion(qp, wqe, ibp); |
2065 | if (qp->s_acked == qp->s_tail) |
2066 | break; |
2067 | } |
2068 | |
2069 | trace_hfi1_rc_ack_do(qp, aeth, psn, wqe); |
2070 | trace_hfi1_sender_do_rc_ack(qp); |
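	/*
	 * The top three bits of the AETH select the response type: 0 is an
	 * ACK (low five syndrome bits carry the credit count), 1 is an RNR
	 * NAK (low five bits carry the RNR timer), 3 is a NAK (low five
	 * bits carry the error code), and 2 is reserved.
	 */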
2071 | switch (aeth >> IB_AETH_NAK_SHIFT) { |
2072 | case 0: /* ACK */ |
2073 | this_cpu_inc(*ibp->rvp.rc_acks); |
2074 | if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { |
2075 | if (wqe_to_tid_req(wqe)->ack_pending) |
2076 | rvt_mod_retry_timer_ext(qp, |
2077 | shift: qpriv->timeout_shift); |
2078 | else |
2079 | rvt_stop_rc_timers(qp); |
2080 | } else if (qp->s_acked != qp->s_tail) { |
2081 | struct rvt_swqe *__w = NULL; |
2082 | |
2083 | if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID) |
2084 | __w = rvt_get_swqe_ptr(qp, n: qpriv->s_tid_cur); |
2085 | |
2086 | /* |
2087 | * Stop timers if we've received all of the TID RDMA |
			 * WRITE responses.
2089 | */ |
2090 | if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE && |
2091 | opcode == TID_OP(WRITE_RESP)) { |
2092 | /* |
2093 | * Normally, the loop above would correctly |
2094 | * process all WQEs from s_acked onward and |
2095 | * either complete them or check for correct |
2096 | * PSN sequencing. |
2097 | * However, for TID RDMA, due to pipelining, |
2098 | * the response may not be for the request at |
				 * s_acked so the above loop would just be
2100 | * skipped. This does not allow for checking |
2101 | * the PSN sequencing. It has to be done |
2102 | * separately. |
2103 | */ |
2104 | if (cmp_psn(a: psn, b: qp->s_last_psn + 1)) { |
2105 | set_restart_qp(qp, rcd); |
2106 | goto bail_stop; |
2107 | } |
2108 | /* |
2109 | * If the psn is being resent, stop the |
2110 | * resending. |
2111 | */ |
2112 | if (qp->s_cur != qp->s_tail && |
2113 | cmp_psn(a: qp->s_psn, b: psn) <= 0) |
2114 | update_qp_retry_state(qp, psn, |
2115 | spsn: __w->psn, |
2116 | lpsn: __w->lpsn); |
2117 | else if (--qpriv->pending_tid_w_resp) |
2118 | rvt_mod_retry_timer(qp); |
2119 | else |
2120 | rvt_stop_rc_timers(qp); |
2121 | } else { |
2122 | /* |
2123 | * We are expecting more ACKs so |
2124 | * mod the retry timer. |
2125 | */ |
2126 | rvt_mod_retry_timer(qp); |
2127 | /* |
2128 | * We can stop re-sending the earlier packets |
2129 | * and continue with the next packet the |
2130 | * receiver wants. |
2131 | */ |
2132 | if (cmp_psn(a: qp->s_psn, b: psn) <= 0) |
2133 | reset_psn(qp, psn: psn + 1); |
2134 | } |
2135 | } else { |
2136 | /* No more acks - kill all timers */ |
2137 | rvt_stop_rc_timers(qp); |
2138 | if (cmp_psn(a: qp->s_psn, b: psn) <= 0) { |
2139 | qp->s_state = OP(SEND_LAST); |
2140 | qp->s_psn = psn + 1; |
2141 | } |
2142 | } |
2143 | if (qp->s_flags & RVT_S_WAIT_ACK) { |
2144 | qp->s_flags &= ~RVT_S_WAIT_ACK; |
2145 | hfi1_schedule_send(qp); |
2146 | } |
2147 | rvt_get_credit(qp, aeth); |
2148 | qp->s_rnr_retry = qp->s_rnr_retry_cnt; |
2149 | qp->s_retry = qp->s_retry_cnt; |
2150 | /* |
2151 | * If the current request is a TID RDMA WRITE request and the |
2152 | * response is not a TID RDMA WRITE RESP packet, s_last_psn |
2153 | * can't be advanced. |
2154 | */ |
2155 | if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && |
2156 | opcode != TID_OP(WRITE_RESP) && |
2157 | cmp_psn(a: psn, b: wqe->psn) >= 0) |
2158 | return 1; |
2159 | update_last_psn(qp, psn); |
2160 | return 1; |
2161 | |
2162 | case 1: /* RNR NAK */ |
2163 | ibp->rvp.n_rnr_naks++; |
2164 | if (qp->s_acked == qp->s_tail) |
2165 | goto bail_stop; |
2166 | if (qp->s_flags & RVT_S_WAIT_RNR) |
2167 | goto bail_stop; |
2168 | rdi = ib_to_rvt(ibdev: qp->ibqp.device); |
2169 | if (!(rdi->post_parms[wqe->wr.opcode].flags & |
2170 | RVT_OPERATION_IGN_RNR_CNT)) { |
2171 | if (qp->s_rnr_retry == 0) { |
2172 | status = IB_WC_RNR_RETRY_EXC_ERR; |
2173 | goto class_b; |
2174 | } |
2175 | if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0) |
2176 | qp->s_rnr_retry--; |
2177 | } |
2178 | |
2179 | /* |
		 * The last valid PSN is the previous PSN. For a TID RDMA WRITE
2181 | * request, s_last_psn should be incremented only when a TID |
2182 | * RDMA WRITE RESP is received to avoid skipping lost TID RDMA |
2183 | * WRITE RESP packets. |
2184 | */ |
2185 | if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { |
2186 | reset_psn(qp, psn: qp->s_last_psn + 1); |
2187 | } else { |
2188 | update_last_psn(qp, psn: psn - 1); |
2189 | reset_psn(qp, psn); |
2190 | } |
2191 | |
2192 | ibp->rvp.n_rc_resends += delta_psn(a: qp->s_psn, b: psn); |
2193 | qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); |
2194 | rvt_stop_rc_timers(qp); |
2195 | rvt_add_rnr_timer(qp, aeth); |
2196 | return 0; |
2197 | |
2198 | case 3: /* NAK */ |
2199 | if (qp->s_acked == qp->s_tail) |
2200 | goto bail_stop; |
2201 | /* The last valid PSN is the previous PSN. */ |
2202 | update_last_psn(qp, psn: psn - 1); |
2203 | switch ((aeth >> IB_AETH_CREDIT_SHIFT) & |
2204 | IB_AETH_CREDIT_MASK) { |
2205 | case 0: /* PSN sequence error */ |
2206 | ibp->rvp.n_seq_naks++; |
2207 | /* |
2208 | * Back up to the responder's expected PSN. |
2209 | * Note that we might get a NAK in the middle of an |
2210 | * RDMA READ response which terminates the RDMA |
2211 | * READ. |
2212 | */ |
2213 | hfi1_restart_rc(qp, psn, wait: 0); |
2214 | hfi1_schedule_send(qp); |
2215 | break; |
2216 | |
2217 | case 1: /* Invalid Request */ |
2218 | status = IB_WC_REM_INV_REQ_ERR; |
2219 | ibp->rvp.n_other_naks++; |
2220 | goto class_b; |
2221 | |
2222 | case 2: /* Remote Access Error */ |
2223 | status = IB_WC_REM_ACCESS_ERR; |
2224 | ibp->rvp.n_other_naks++; |
2225 | goto class_b; |
2226 | |
2227 | case 3: /* Remote Operation Error */ |
2228 | status = IB_WC_REM_OP_ERR; |
2229 | ibp->rvp.n_other_naks++; |
2230 | class_b: |
2231 | if (qp->s_last == qp->s_acked) { |
2232 | if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) |
2233 | hfi1_kern_read_tid_flow_free(qp); |
2234 | |
2235 | hfi1_trdma_send_complete(qp, wqe, status); |
2236 | rvt_error_qp(qp, err: IB_WC_WR_FLUSH_ERR); |
2237 | } |
2238 | break; |
2239 | |
2240 | default: |
2241 | /* Ignore other reserved NAK error codes */ |
2242 | goto reserved; |
2243 | } |
2244 | qp->s_retry = qp->s_retry_cnt; |
2245 | qp->s_rnr_retry = qp->s_rnr_retry_cnt; |
2246 | goto bail_stop; |
2247 | |
2248 | default: /* 2: reserved */ |
2249 | reserved: |
2250 | /* Ignore reserved NAK codes. */ |
2251 | goto bail_stop; |
2252 | } |
2253 | /* cannot be reached */ |
2254 | bail_stop: |
2255 | rvt_stop_rc_timers(qp); |
2256 | return ret; |
2257 | } |
2258 | |
2259 | /* |
2260 | * We have seen an out of sequence RDMA read middle or last packet. |
2261 | * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE. |
2262 | */ |
2263 | static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, |
2264 | struct hfi1_ctxtdata *rcd) |
2265 | { |
2266 | struct rvt_swqe *wqe; |
2267 | |
2268 | lockdep_assert_held(&qp->s_lock); |
2269 | /* Remove QP from retry timer */ |
2270 | rvt_stop_rc_timers(qp); |
2271 | |
2272 | wqe = rvt_get_swqe_ptr(qp, n: qp->s_acked); |
2273 | |
2274 | while (cmp_psn(a: psn, b: wqe->lpsn) > 0) { |
2275 | if (wqe->wr.opcode == IB_WR_RDMA_READ || |
2276 | wqe->wr.opcode == IB_WR_TID_RDMA_READ || |
2277 | wqe->wr.opcode == IB_WR_TID_RDMA_WRITE || |
2278 | wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || |
2279 | wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) |
2280 | break; |
2281 | wqe = do_rc_completion(qp, wqe, ibp); |
2282 | } |
2283 | |
2284 | ibp->rvp.n_rdma_seq++; |
2285 | qp->r_flags |= RVT_R_RDMAR_SEQ; |
2286 | hfi1_restart_rc(qp, psn: qp->s_last_psn + 1, wait: 0); |
2287 | if (list_empty(head: &qp->rspwait)) { |
2288 | qp->r_flags |= RVT_R_RSP_SEND; |
2289 | rvt_get_qp(qp); |
2290 | list_add_tail(new: &qp->rspwait, head: &rcd->qp_wait_list); |
2291 | } |
2292 | } |
2293 | |
2294 | /** |
2295 | * rc_rcv_resp - process an incoming RC response packet |
2296 | * @packet: data packet information |
2297 | * |
2298 | * This is called from hfi1_rc_rcv() to process an incoming RC response |
2299 | * packet for the given QP. |
2300 | * Called at interrupt level. |
2301 | */ |
2302 | static void rc_rcv_resp(struct hfi1_packet *packet) |
2303 | { |
2304 | struct hfi1_ctxtdata *rcd = packet->rcd; |
2305 | void *data = packet->payload; |
2306 | u32 tlen = packet->tlen; |
2307 | struct rvt_qp *qp = packet->qp; |
2308 | struct hfi1_ibport *ibp; |
2309 | struct ib_other_headers *ohdr = packet->ohdr; |
2310 | struct rvt_swqe *wqe; |
2311 | enum ib_wc_status status; |
2312 | unsigned long flags; |
2313 | int diff; |
2314 | u64 val; |
2315 | u32 aeth; |
2316 | u32 psn = ib_bth_get_psn(ohdr: packet->ohdr); |
2317 | u32 pmtu = qp->pmtu; |
2318 | u16 hdrsize = packet->hlen; |
2319 | u8 opcode = packet->opcode; |
2320 | u8 pad = packet->pad; |
	u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
2322 | |
2323 | spin_lock_irqsave(&qp->s_lock, flags); |
2324 | trace_hfi1_ack(qp, psn); |
2325 | |
2326 | /* Ignore invalid responses. */ |
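	/*
	 * A response PSN at or beyond s_next_psn refers to a request that
	 * was never sent.
	 */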
2327 | if (cmp_psn(a: psn, READ_ONCE(qp->s_next_psn)) >= 0) |
2328 | goto ack_done; |
2329 | |
2330 | /* Ignore duplicate responses. */ |
2331 | diff = cmp_psn(a: psn, b: qp->s_last_psn); |
2332 | if (unlikely(diff <= 0)) { |
2333 | /* Update credits for "ghost" ACKs */ |
2334 | if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { |
2335 | aeth = be32_to_cpu(ohdr->u.aeth); |
2336 | if ((aeth >> IB_AETH_NAK_SHIFT) == 0) |
2337 | rvt_get_credit(qp, aeth); |
2338 | } |
2339 | goto ack_done; |
2340 | } |
2341 | |
2342 | /* |
2343 | * Skip everything other than the PSN we expect, if we are waiting |
2344 | * for a reply to a restarted RDMA read or atomic op. |
2345 | */ |
2346 | if (qp->r_flags & RVT_R_RDMAR_SEQ) { |
2347 | if (cmp_psn(a: psn, b: qp->s_last_psn + 1) != 0) |
2348 | goto ack_done; |
2349 | qp->r_flags &= ~RVT_R_RDMAR_SEQ; |
2350 | } |
2351 | |
2352 | if (unlikely(qp->s_acked == qp->s_tail)) |
2353 | goto ack_done; |
2354 | wqe = rvt_get_swqe_ptr(qp, n: qp->s_acked); |
2355 | status = IB_WC_SUCCESS; |
2356 | |
2357 | switch (opcode) { |
2358 | case OP(ACKNOWLEDGE): |
2359 | case OP(ATOMIC_ACKNOWLEDGE): |
2360 | case OP(RDMA_READ_RESPONSE_FIRST): |
2361 | aeth = be32_to_cpu(ohdr->u.aeth); |
2362 | if (opcode == OP(ATOMIC_ACKNOWLEDGE)) |
2363 | val = ib_u64_get(p: &ohdr->u.at.atomic_ack_eth); |
2364 | else |
2365 | val = 0; |
2366 | if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) || |
2367 | opcode != OP(RDMA_READ_RESPONSE_FIRST)) |
2368 | goto ack_done; |
2369 | wqe = rvt_get_swqe_ptr(qp, n: qp->s_acked); |
2370 | if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) |
2371 | goto ack_op_err; |
2372 | /* |
2373 | * If this is a response to a resent RDMA read, we |
2374 | * have to be careful to copy the data to the right |
2375 | * location. |
2376 | */ |
2377 | qp->s_rdma_read_len = restart_sge(ss: &qp->s_rdma_read_sge, |
2378 | wqe, psn, pmtu); |
2379 | goto read_middle; |
2380 | |
2381 | case OP(RDMA_READ_RESPONSE_MIDDLE): |
2382 | /* no AETH, no ACK */ |
2383 | if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) |
2384 | goto ack_seq_err; |
2385 | if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) |
2386 | goto ack_op_err; |
2387 | read_middle: |
2388 | if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) |
2389 | goto ack_len_err; |
2390 | if (unlikely(pmtu >= qp->s_rdma_read_len)) |
2391 | goto ack_len_err; |
2392 | |
2393 | /* |
		 * We got a response, so update the timeout.
		 * The timeout interval is 4.096 usec * (1 << qp->timeout).
2396 | */ |
2397 | rvt_mod_retry_timer(qp); |
2398 | if (qp->s_flags & RVT_S_WAIT_ACK) { |
2399 | qp->s_flags &= ~RVT_S_WAIT_ACK; |
2400 | hfi1_schedule_send(qp); |
2401 | } |
2402 | |
2403 | if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) |
2404 | qp->s_retry = qp->s_retry_cnt; |
2405 | |
2406 | /* |
2407 | * Update the RDMA receive state but do the copy w/o |
2408 | * holding the locks and blocking interrupts. |
2409 | */ |
2410 | qp->s_rdma_read_len -= pmtu; |
2411 | update_last_psn(qp, psn); |
2412 | spin_unlock_irqrestore(lock: &qp->s_lock, flags); |
2413 | rvt_copy_sge(qp, ss: &qp->s_rdma_read_sge, |
2414 | data, length: pmtu, release: false, copy_last: false); |
2415 | goto bail; |
2416 | |
2417 | case OP(RDMA_READ_RESPONSE_ONLY): |
2418 | aeth = be32_to_cpu(ohdr->u.aeth); |
2419 | if (!do_rc_ack(qp, aeth, psn, opcode, val: 0, rcd)) |
2420 | goto ack_done; |
2421 | /* |
2422 | * Check that the data size is >= 0 && <= pmtu. |
2423 | * Remember to account for ICRC (4). |
2424 | */ |
2425 | if (unlikely(tlen < (hdrsize + extra_bytes))) |
2426 | goto ack_len_err; |
2427 | /* |
2428 | * If this is a response to a resent RDMA read, we |
2429 | * have to be careful to copy the data to the right |
2430 | * location. |
2431 | */ |
2432 | wqe = rvt_get_swqe_ptr(qp, n: qp->s_acked); |
2433 | qp->s_rdma_read_len = restart_sge(ss: &qp->s_rdma_read_sge, |
2434 | wqe, psn, pmtu); |
2435 | goto read_last; |
2436 | |
2437 | case OP(RDMA_READ_RESPONSE_LAST): |
2438 | /* ACKs READ req. */ |
2439 | if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) |
2440 | goto ack_seq_err; |
2441 | if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) |
2442 | goto ack_op_err; |
2443 | /* |
2444 | * Check that the data size is >= 1 && <= pmtu. |
2445 | * Remember to account for ICRC (4). |
2446 | */ |
2447 | if (unlikely(tlen <= (hdrsize + extra_bytes))) |
2448 | goto ack_len_err; |
2449 | read_last: |
2450 | tlen -= hdrsize + extra_bytes; |
2451 | if (unlikely(tlen != qp->s_rdma_read_len)) |
2452 | goto ack_len_err; |
2453 | aeth = be32_to_cpu(ohdr->u.aeth); |
2454 | rvt_copy_sge(qp, ss: &qp->s_rdma_read_sge, |
2455 | data, length: tlen, release: false, copy_last: false); |
2456 | WARN_ON(qp->s_rdma_read_sge.num_sge); |
2457 | (void)do_rc_ack(qp, aeth, psn, |
2458 | OP(RDMA_READ_RESPONSE_LAST), val: 0, rcd); |
2459 | goto ack_done; |
2460 | } |
2461 | |
2462 | ack_op_err: |
2463 | status = IB_WC_LOC_QP_OP_ERR; |
2464 | goto ack_err; |
2465 | |
2466 | ack_seq_err: |
2467 | ibp = rcd_to_iport(rcd); |
2468 | rdma_seq_err(qp, ibp, psn, rcd); |
2469 | goto ack_done; |
2470 | |
2471 | ack_len_err: |
2472 | status = IB_WC_LOC_LEN_ERR; |
2473 | ack_err: |
2474 | if (qp->s_last == qp->s_acked) { |
2475 | rvt_send_complete(qp, wqe, status); |
2476 | rvt_error_qp(qp, err: IB_WC_WR_FLUSH_ERR); |
2477 | } |
2478 | ack_done: |
2479 | spin_unlock_irqrestore(lock: &qp->s_lock, flags); |
2480 | bail: |
2481 | return; |
2482 | } |
2483 | |
2484 | static inline void rc_cancel_ack(struct rvt_qp *qp) |
2485 | { |
2486 | qp->r_adefered = 0; |
2487 | if (list_empty(head: &qp->rspwait)) |
2488 | return; |
2489 | list_del_init(entry: &qp->rspwait); |
2490 | qp->r_flags &= ~RVT_R_RSP_NAK; |
2491 | rvt_put_qp(qp); |
2492 | } |
2493 | |
2494 | /** |
2495 | * rc_rcv_error - process an incoming duplicate or error RC packet |
2496 | * @ohdr: the other headers for this packet |
2497 | * @data: the packet data |
2498 | * @qp: the QP for this packet |
2499 | * @opcode: the opcode for this packet |
2500 | * @psn: the packet sequence number for this packet |
2501 | * @diff: the difference between the PSN and the expected PSN |
2502 | * @rcd: the receive context |
2503 | * |
2504 | * This is called from hfi1_rc_rcv() to process an unexpected |
2505 | * incoming RC packet for the given QP. |
2506 | * Called at interrupt level. |
2507 | * Return 1 if no more processing is needed; otherwise return 0 to |
2508 | * schedule a response to be sent. |
2509 | */ |
2510 | static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, |
2511 | struct rvt_qp *qp, u32 opcode, u32 psn, |
2512 | int diff, struct hfi1_ctxtdata *rcd) |
2513 | { |
2514 | struct hfi1_ibport *ibp = rcd_to_iport(rcd); |
2515 | struct rvt_ack_entry *e; |
2516 | unsigned long flags; |
2517 | u8 prev; |
2518 | u8 mra; /* most recent ACK */ |
2519 | bool old_req; |
2520 | |
2521 | trace_hfi1_rcv_error(qp, psn); |
2522 | if (diff > 0) { |
2523 | /* |
2524 | * Packet sequence error. |
2525 | * A NAK will ACK earlier sends and RDMA writes. |
2526 | * Don't queue the NAK if we already sent one. |
2527 | */ |
2528 | if (!qp->r_nak_state) { |
2529 | ibp->rvp.n_rc_seqnak++; |
2530 | qp->r_nak_state = IB_NAK_PSN_ERROR; |
2531 | /* Use the expected PSN. */ |
2532 | qp->r_ack_psn = qp->r_psn; |
2533 | /* |
2534 | * Wait to send the sequence NAK until all packets |
2535 | * in the receive queue have been processed. |
2536 | * Otherwise, we end up propagating congestion. |
2537 | */ |
2538 | rc_defered_ack(rcd, qp); |
2539 | } |
2540 | goto done; |
2541 | } |
2542 | |
2543 | /* |
2544 | * Handle a duplicate request. Don't re-execute SEND, RDMA |
2545 | * write or atomic op. Don't NAK errors, just silently drop |
2546 | * the duplicate request. Note that r_sge, r_len, and |
2547 | * r_rcv_len may be in use so don't modify them. |
2548 | * |
2549 | * We are supposed to ACK the earliest duplicate PSN but we |
2550 | * can coalesce an outstanding duplicate ACK. We have to |
2551 | * send the earliest so that RDMA reads can be restarted at |
2552 | * the requester's expected PSN. |
2553 | * |
2554 | * First, find where this duplicate PSN falls within the |
2555 | * ACKs previously sent. |
2556 | * old_req is true if there is an older response that is scheduled |
2557 | * to be sent before sending this one. |
2558 | */ |
2559 | e = NULL; |
2560 | old_req = true; |
2561 | ibp->rvp.n_rc_dupreq++; |
2562 | |
2563 | spin_lock_irqsave(&qp->s_lock, flags); |
2564 | |
2565 | e = find_prev_entry(qp, psn, prev: &prev, prev_ack: &mra, scheduled: &old_req); |
2566 | |
2567 | switch (opcode) { |
2568 | case OP(RDMA_READ_REQUEST): { |
2569 | struct ib_reth *reth; |
2570 | u32 offset; |
2571 | u32 len; |
2572 | |
2573 | /* |
2574 | * If we didn't find the RDMA read request in the ack queue, |
2575 | * we can ignore this request. |
2576 | */ |
2577 | if (!e || e->opcode != OP(RDMA_READ_REQUEST)) |
2578 | goto unlock_done; |
2579 | /* RETH comes after BTH */ |
2580 | reth = &ohdr->u.rc.reth; |
2581 | /* |
2582 | * Address range must be a subset of the original |
2583 | * request and start on pmtu boundaries. |
2584 | * We reuse the old ack_queue slot since the requester |
2585 | * should not back up and request an earlier PSN for the |
2586 | * same request. |
2587 | */ |
2588 | offset = delta_psn(a: psn, b: e->psn) * qp->pmtu; |
2589 | len = be32_to_cpu(reth->length); |
2590 | if (unlikely(offset + len != e->rdma_sge.sge_length)) |
2591 | goto unlock_done; |
2592 | release_rdma_sge_mr(e); |
2593 | if (len != 0) { |
2594 | u32 rkey = be32_to_cpu(reth->rkey); |
2595 | u64 vaddr = get_ib_reth_vaddr(reth); |
2596 | int ok; |
2597 | |
2598 | ok = rvt_rkey_ok(qp, sge: &e->rdma_sge, len, vaddr, rkey, |
2599 | acc: IB_ACCESS_REMOTE_READ); |
2600 | if (unlikely(!ok)) |
2601 | goto unlock_done; |
2602 | } else { |
2603 | e->rdma_sge.vaddr = NULL; |
2604 | e->rdma_sge.length = 0; |
2605 | e->rdma_sge.sge_length = 0; |
2606 | } |
2607 | e->psn = psn; |
2608 | if (old_req) |
2609 | goto unlock_done; |
2610 | if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) |
2611 | qp->s_acked_ack_queue = prev; |
2612 | qp->s_tail_ack_queue = prev; |
2613 | break; |
2614 | } |
2615 | |
2616 | case OP(COMPARE_SWAP): |
2617 | case OP(FETCH_ADD): { |
2618 | /* |
2619 | * If we didn't find the atomic request in the ack queue |
2620 | * or the send engine is already backed up to send an |
2621 | * earlier entry, we can ignore this request. |
2622 | */ |
2623 | if (!e || e->opcode != (u8)opcode || old_req) |
2624 | goto unlock_done; |
2625 | if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) |
2626 | qp->s_acked_ack_queue = prev; |
2627 | qp->s_tail_ack_queue = prev; |
2628 | break; |
2629 | } |
2630 | |
2631 | default: |
2632 | /* |
2633 | * Ignore this operation if it doesn't request an ACK |
2634 | * or an earlier RDMA read or atomic is going to be resent. |
2635 | */ |
2636 | if (!(psn & IB_BTH_REQ_ACK) || old_req) |
2637 | goto unlock_done; |
2638 | /* |
2639 | * Resend the most recent ACK if this request is |
2640 | * after all the previous RDMA reads and atomics. |
2641 | */ |
2642 | if (mra == qp->r_head_ack_queue) { |
2643 | spin_unlock_irqrestore(lock: &qp->s_lock, flags); |
2644 | qp->r_nak_state = 0; |
2645 | qp->r_ack_psn = qp->r_psn - 1; |
2646 | goto send_ack; |
2647 | } |
2648 | |
2649 | /* |
2650 | * Resend the RDMA read or atomic op which |
2651 | * ACKs this duplicate request. |
2652 | */ |
2653 | if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) |
2654 | qp->s_acked_ack_queue = mra; |
2655 | qp->s_tail_ack_queue = mra; |
2656 | break; |
2657 | } |
2658 | qp->s_ack_state = OP(ACKNOWLEDGE); |
2659 | qp->s_flags |= RVT_S_RESP_PENDING; |
2660 | qp->r_nak_state = 0; |
2661 | hfi1_schedule_send(qp); |
2662 | |
2663 | unlock_done: |
2664 | spin_unlock_irqrestore(lock: &qp->s_lock, flags); |
2665 | done: |
2666 | return 1; |
2667 | |
2668 | send_ack: |
2669 | return 0; |
2670 | } |
2671 | |
2672 | static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, |
2673 | u32 lqpn, u32 rqpn, u8 svc_type) |
2674 | { |
2675 | struct opa_hfi1_cong_log_event_internal *cc_event; |
2676 | unsigned long flags; |
2677 | |
2678 | if (sl >= OPA_MAX_SLS) |
2679 | return; |
2680 | |
2681 | spin_lock_irqsave(&ppd->cc_log_lock, flags); |
2682 | |
2683 | ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8); |
2684 | ppd->threshold_event_counter++; |
2685 | |
2686 | cc_event = &ppd->cc_events[ppd->cc_log_idx++]; |
2687 | if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS) |
2688 | ppd->cc_log_idx = 0; |
2689 | cc_event->lqpn = lqpn & RVT_QPN_MASK; |
2690 | cc_event->rqpn = rqpn & RVT_QPN_MASK; |
2691 | cc_event->sl = sl; |
2692 | cc_event->svc_type = svc_type; |
2693 | cc_event->rlid = rlid; |
2694 | /* keep timestamp in units of 1.024 usec */ |
2695 | cc_event->timestamp = ktime_get_ns() / 1024; |
2696 | |
2697 | spin_unlock_irqrestore(lock: &ppd->cc_log_lock, flags); |
2698 | } |
2699 | |
2700 | void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, |
2701 | u32 rqpn, u8 svc_type) |
2702 | { |
2703 | struct cca_timer *cca_timer; |
2704 | u16 ccti, ccti_incr, ccti_timer, ccti_limit; |
2705 | u8 trigger_threshold; |
2706 | struct cc_state *cc_state; |
2707 | unsigned long flags; |
2708 | |
2709 | if (sl >= OPA_MAX_SLS) |
2710 | return; |
2711 | |
2712 | cc_state = get_cc_state(ppd); |
2713 | |
2714 | if (!cc_state) |
2715 | return; |
2716 | |
2717 | /* |
2718 | * 1) increase CCTI (for this SL) |
2719 | * 2) select IPG (i.e., call set_link_ipg()) |
2720 | * 3) start timer |
2721 | */ |
2722 | ccti_limit = cc_state->cct.ccti_limit; |
2723 | ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase; |
2724 | ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; |
2725 | trigger_threshold = |
2726 | cc_state->cong_setting.entries[sl].trigger_threshold; |
2727 | |
2728 | spin_lock_irqsave(&ppd->cca_timer_lock, flags); |
2729 | |
2730 | cca_timer = &ppd->cca_timer[sl]; |
2731 | if (cca_timer->ccti < ccti_limit) { |
2732 | if (cca_timer->ccti + ccti_incr <= ccti_limit) |
2733 | cca_timer->ccti += ccti_incr; |
2734 | else |
2735 | cca_timer->ccti = ccti_limit; |
2736 | set_link_ipg(ppd); |
2737 | } |
2738 | |
2739 | ccti = cca_timer->ccti; |
2740 | |
2741 | if (!hrtimer_active(timer: &cca_timer->hrtimer)) { |
2742 | /* ccti_timer is in units of 1.024 usec */ |
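		/* e.g. ccti_timer == 100 gives nsec = 102400, i.e. 102.4 usec */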
2743 | unsigned long nsec = 1024 * ccti_timer; |
2744 | |
2745 | hrtimer_start(timer: &cca_timer->hrtimer, tim: ns_to_ktime(ns: nsec), |
2746 | mode: HRTIMER_MODE_REL_PINNED); |
2747 | } |
2748 | |
2749 | spin_unlock_irqrestore(lock: &ppd->cca_timer_lock, flags); |
2750 | |
2751 | if ((trigger_threshold != 0) && (ccti >= trigger_threshold)) |
2752 | log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type); |
2753 | } |
2754 | |
2755 | /** |
2756 | * hfi1_rc_rcv - process an incoming RC packet |
2757 | * @packet: data packet information |
2758 | * |
2759 | * This is called from qp_rcv() to process an incoming RC packet |
2760 | * for the given QP. |
2761 | * May be called at interrupt level. |
2762 | */ |
2763 | void hfi1_rc_rcv(struct hfi1_packet *packet) |
2764 | { |
2765 | struct hfi1_ctxtdata *rcd = packet->rcd; |
2766 | void *data = packet->payload; |
2767 | u32 tlen = packet->tlen; |
2768 | struct rvt_qp *qp = packet->qp; |
2769 | struct hfi1_qp_priv *qpriv = qp->priv; |
2770 | struct hfi1_ibport *ibp = rcd_to_iport(rcd); |
2771 | struct ib_other_headers *ohdr = packet->ohdr; |
2772 | u32 opcode = packet->opcode; |
2773 | u32 hdrsize = packet->hlen; |
2774 | u32 psn = ib_bth_get_psn(ohdr: packet->ohdr); |
2775 | u32 pad = packet->pad; |
2776 | struct ib_wc wc; |
2777 | u32 pmtu = qp->pmtu; |
2778 | int diff; |
2779 | struct ib_reth *reth; |
2780 | unsigned long flags; |
2781 | int ret; |
2782 | bool copy_last = false, fecn; |
2783 | u32 rkey; |
	u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
2785 | |
2786 | lockdep_assert_held(&qp->r_lock); |
2787 | |
2788 | if (hfi1_ruc_check_hdr(ibp, packet)) |
2789 | return; |
2790 | |
2791 | fecn = process_ecn(qp, pkt: packet); |
2792 | opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1])); |
2793 | |
2794 | /* |
2795 | * Process responses (ACKs) before anything else. Note that the |
2796 | * packet sequence number will be for something in the send work |
2797 | * queue rather than the expected receive packet sequence number. |
2798 | * In other words, this QP is the requester. |
2799 | */ |
2800 | if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && |
2801 | opcode <= OP(ATOMIC_ACKNOWLEDGE)) { |
2802 | rc_rcv_resp(packet); |
2803 | return; |
2804 | } |
2805 | |
2806 | /* Compute 24 bits worth of difference. */ |
2807 | diff = delta_psn(a: psn, b: qp->r_psn); |
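	/*
	 * A positive diff means the packet is ahead of the expected PSN;
	 * a negative diff means it is a duplicate.
	 */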
2808 | if (unlikely(diff)) { |
2809 | if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd)) |
2810 | return; |
2811 | goto send_ack; |
2812 | } |
2813 | |
2814 | /* Check for opcode sequence errors. */ |
2815 | switch (qp->r_state) { |
2816 | case OP(SEND_FIRST): |
2817 | case OP(SEND_MIDDLE): |
2818 | if (opcode == OP(SEND_MIDDLE) || |
2819 | opcode == OP(SEND_LAST) || |
2820 | opcode == OP(SEND_LAST_WITH_IMMEDIATE) || |
2821 | opcode == OP(SEND_LAST_WITH_INVALIDATE)) |
2822 | break; |
2823 | goto nack_inv; |
2824 | |
2825 | case OP(RDMA_WRITE_FIRST): |
2826 | case OP(RDMA_WRITE_MIDDLE): |
2827 | if (opcode == OP(RDMA_WRITE_MIDDLE) || |
2828 | opcode == OP(RDMA_WRITE_LAST) || |
2829 | opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) |
2830 | break; |
2831 | goto nack_inv; |
2832 | |
2833 | default: |
2834 | if (opcode == OP(SEND_MIDDLE) || |
2835 | opcode == OP(SEND_LAST) || |
2836 | opcode == OP(SEND_LAST_WITH_IMMEDIATE) || |
2837 | opcode == OP(SEND_LAST_WITH_INVALIDATE) || |
2838 | opcode == OP(RDMA_WRITE_MIDDLE) || |
2839 | opcode == OP(RDMA_WRITE_LAST) || |
2840 | opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) |
2841 | goto nack_inv; |
2842 | /* |
2843 | * Note that it is up to the requester to not send a new |
2844 | * RDMA read or atomic operation before receiving an ACK |
2845 | * for the previous operation. |
2846 | */ |
2847 | break; |
2848 | } |
2849 | |
2850 | if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) |
2851 | rvt_comm_est(qp); |
2852 | |
2853 | /* OK, process the packet. */ |
2854 | switch (opcode) { |
2855 | case OP(SEND_FIRST): |
2856 | ret = rvt_get_rwqe(qp, wr_id_only: false); |
2857 | if (ret < 0) |
2858 | goto nack_op_err; |
2859 | if (!ret) |
2860 | goto rnr_nak; |
2861 | qp->r_rcv_len = 0; |
2862 | fallthrough; |
2863 | case OP(SEND_MIDDLE): |
2864 | case OP(RDMA_WRITE_MIDDLE): |
2865 | send_middle: |
2866 | /* Check for invalid length PMTU or posted rwqe len. */ |
2867 | /* |
		 * There will be no padding for 9B packets, but 16B packets
		 * will come in with some padding since we always add
		 * CRC and LT bytes, which need to be flit aligned.
2871 | */ |
2872 | if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) |
2873 | goto nack_inv; |
2874 | qp->r_rcv_len += pmtu; |
2875 | if (unlikely(qp->r_rcv_len > qp->r_len)) |
2876 | goto nack_inv; |
2877 | rvt_copy_sge(qp, ss: &qp->r_sge, data, length: pmtu, release: true, copy_last: false); |
2878 | break; |
2879 | |
2880 | case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): |
2881 | /* consume RWQE */ |
2882 | ret = rvt_get_rwqe(qp, wr_id_only: true); |
2883 | if (ret < 0) |
2884 | goto nack_op_err; |
2885 | if (!ret) |
2886 | goto rnr_nak; |
2887 | goto send_last_imm; |
2888 | |
2889 | case OP(SEND_ONLY): |
2890 | case OP(SEND_ONLY_WITH_IMMEDIATE): |
2891 | case OP(SEND_ONLY_WITH_INVALIDATE): |
2892 | ret = rvt_get_rwqe(qp, wr_id_only: false); |
2893 | if (ret < 0) |
2894 | goto nack_op_err; |
2895 | if (!ret) |
2896 | goto rnr_nak; |
2897 | qp->r_rcv_len = 0; |
2898 | if (opcode == OP(SEND_ONLY)) |
2899 | goto no_immediate_data; |
2900 | if (opcode == OP(SEND_ONLY_WITH_INVALIDATE)) |
2901 | goto send_last_inv; |
2902 | fallthrough; /* for SEND_ONLY_WITH_IMMEDIATE */ |
2903 | case OP(SEND_LAST_WITH_IMMEDIATE): |
2904 | send_last_imm: |
2905 | wc.ex.imm_data = ohdr->u.imm_data; |
2906 | wc.wc_flags = IB_WC_WITH_IMM; |
2907 | goto send_last; |
2908 | case OP(SEND_LAST_WITH_INVALIDATE): |
2909 | send_last_inv: |
2910 | rkey = be32_to_cpu(ohdr->u.ieth); |
2911 | if (rvt_invalidate_rkey(qp, rkey)) |
2912 | goto no_immediate_data; |
2913 | wc.ex.invalidate_rkey = rkey; |
2914 | wc.wc_flags = IB_WC_WITH_INVALIDATE; |
2915 | goto send_last; |
2916 | case OP(RDMA_WRITE_LAST): |
2917 | copy_last = rvt_is_user_qp(qp); |
2918 | fallthrough; |
2919 | case OP(SEND_LAST): |
2920 | no_immediate_data: |
2921 | wc.wc_flags = 0; |
2922 | wc.ex.imm_data = 0; |
2923 | send_last: |
2924 | /* Check for invalid length. */ |
2925 | /* LAST len should be >= 1 */ |
2926 | if (unlikely(tlen < (hdrsize + extra_bytes))) |
2927 | goto nack_inv; |
		/* Don't count the CRC (and padding and LT byte for 16B). */
2929 | tlen -= (hdrsize + extra_bytes); |
2930 | wc.byte_len = tlen + qp->r_rcv_len; |
2931 | if (unlikely(wc.byte_len > qp->r_len)) |
2932 | goto nack_inv; |
2933 | rvt_copy_sge(qp, ss: &qp->r_sge, data, length: tlen, release: true, copy_last); |
2934 | rvt_put_ss(ss: &qp->r_sge); |
2935 | qp->r_msn++; |
2936 | if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) |
2937 | break; |
2938 | wc.wr_id = qp->r_wr_id; |
2939 | wc.status = IB_WC_SUCCESS; |
2940 | if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || |
2941 | opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) |
2942 | wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; |
2943 | else |
2944 | wc.opcode = IB_WC_RECV; |
2945 | wc.qp = &qp->ibqp; |
2946 | wc.src_qp = qp->remote_qpn; |
2947 | wc.slid = rdma_ah_get_dlid(attr: &qp->remote_ah_attr) & U16_MAX; |
2948 | /* |
2949 | * It seems that IB mandates the presence of an SL in a |
2950 | * work completion only for the UD transport (see section |
2951 | * 11.4.2 of IBTA Vol. 1). |
2952 | * |
2953 | * However, the way the SL is chosen below is consistent |
		 * with the way that IB/qib works and is trying to avoid
2955 | * introducing incompatibilities. |
2956 | * |
2957 | * See also OPA Vol. 1, section 9.7.6, and table 9-17. |
2958 | */ |
2959 | wc.sl = rdma_ah_get_sl(attr: &qp->remote_ah_attr); |
2960 | /* zero fields that are N/A */ |
2961 | wc.vendor_err = 0; |
2962 | wc.pkey_index = 0; |
2963 | wc.dlid_path_bits = 0; |
2964 | wc.port_num = 0; |
2965 | /* Signal completion event if the solicited bit is set. */ |
2966 | rvt_recv_cq(qp, wc: &wc, solicited: ib_bth_is_solicited(ohdr)); |
2967 | break; |
2968 | |
2969 | case OP(RDMA_WRITE_ONLY): |
2970 | copy_last = rvt_is_user_qp(qp); |
2971 | fallthrough; |
2972 | case OP(RDMA_WRITE_FIRST): |
2973 | case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): |
2974 | if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) |
2975 | goto nack_inv; |
2976 | /* consume RWQE */ |
2977 | reth = &ohdr->u.rc.reth; |
2978 | qp->r_len = be32_to_cpu(reth->length); |
2979 | qp->r_rcv_len = 0; |
2980 | qp->r_sge.sg_list = NULL; |
2981 | if (qp->r_len != 0) { |
2982 | u32 rkey = be32_to_cpu(reth->rkey); |
2983 | u64 vaddr = get_ib_reth_vaddr(reth); |
2984 | int ok; |
2985 | |
2986 | /* Check rkey & NAK */ |
2987 | ok = rvt_rkey_ok(qp, sge: &qp->r_sge.sge, len: qp->r_len, vaddr, |
2988 | rkey, acc: IB_ACCESS_REMOTE_WRITE); |
2989 | if (unlikely(!ok)) |
2990 | goto nack_acc; |
2991 | qp->r_sge.num_sge = 1; |
2992 | } else { |
2993 | qp->r_sge.num_sge = 0; |
2994 | qp->r_sge.sge.mr = NULL; |
2995 | qp->r_sge.sge.vaddr = NULL; |
2996 | qp->r_sge.sge.length = 0; |
2997 | qp->r_sge.sge.sge_length = 0; |
2998 | } |
2999 | if (opcode == OP(RDMA_WRITE_FIRST)) |
3000 | goto send_middle; |
3001 | else if (opcode == OP(RDMA_WRITE_ONLY)) |
3002 | goto no_immediate_data; |
3003 | ret = rvt_get_rwqe(qp, wr_id_only: true); |
3004 | if (ret < 0) |
3005 | goto nack_op_err; |
3006 | if (!ret) { |
3007 | /* peer will send again */ |
3008 | rvt_put_ss(ss: &qp->r_sge); |
3009 | goto rnr_nak; |
3010 | } |
3011 | wc.ex.imm_data = ohdr->u.rc.imm_data; |
3012 | wc.wc_flags = IB_WC_WITH_IMM; |
3013 | goto send_last; |
3014 | |
3015 | case OP(RDMA_READ_REQUEST): { |
3016 | struct rvt_ack_entry *e; |
3017 | u32 len; |
3018 | u8 next; |
3019 | |
3020 | if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) |
3021 | goto nack_inv; |
3022 | next = qp->r_head_ack_queue + 1; |
3023 | /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */ |
3024 | if (next > rvt_size_atomic(rdi: ib_to_rvt(ibdev: qp->ibqp.device))) |
3025 | next = 0; |
3026 | spin_lock_irqsave(&qp->s_lock, flags); |
3027 | if (unlikely(next == qp->s_acked_ack_queue)) { |
3028 | if (!qp->s_ack_queue[next].sent) |
3029 | goto nack_inv_unlck; |
3030 | update_ack_queue(qp, n: next); |
3031 | } |
3032 | e = &qp->s_ack_queue[qp->r_head_ack_queue]; |
3033 | release_rdma_sge_mr(e); |
3034 | reth = &ohdr->u.rc.reth; |
3035 | len = be32_to_cpu(reth->length); |
3036 | if (len) { |
3037 | u32 rkey = be32_to_cpu(reth->rkey); |
3038 | u64 vaddr = get_ib_reth_vaddr(reth); |
3039 | int ok; |
3040 | |
3041 | /* Check rkey & NAK */ |
3042 | ok = rvt_rkey_ok(qp, sge: &e->rdma_sge, len, vaddr, |
3043 | rkey, acc: IB_ACCESS_REMOTE_READ); |
3044 | if (unlikely(!ok)) |
3045 | goto nack_acc_unlck; |
3046 | /* |
3047 | * Update the next expected PSN. We add 1 later |
3048 | * below, so only add the remainder here. |
3049 | */ |
3050 | qp->r_psn += rvt_div_mtu(qp, len: len - 1); |
3051 | } else { |
3052 | e->rdma_sge.mr = NULL; |
3053 | e->rdma_sge.vaddr = NULL; |
3054 | e->rdma_sge.length = 0; |
3055 | e->rdma_sge.sge_length = 0; |
3056 | } |
3057 | e->opcode = opcode; |
3058 | e->sent = 0; |
3059 | e->psn = psn; |
3060 | e->lpsn = qp->r_psn; |
3061 | /* |
3062 | * We need to increment the MSN here instead of when we |
3063 | * finish sending the result since a duplicate request would |
3064 | * increment it more than once. |
3065 | */ |
3066 | qp->r_msn++; |
3067 | qp->r_psn++; |
3068 | qp->r_state = opcode; |
3069 | qp->r_nak_state = 0; |
3070 | qp->r_head_ack_queue = next; |
3071 | qpriv->r_tid_alloc = qp->r_head_ack_queue; |
3072 | |
3073 | /* Schedule the send engine. */ |
3074 | qp->s_flags |= RVT_S_RESP_PENDING; |
3075 | if (fecn) |
3076 | qp->s_flags |= RVT_S_ECN; |
3077 | hfi1_schedule_send(qp); |
3078 | |
3079 | spin_unlock_irqrestore(lock: &qp->s_lock, flags); |
3080 | return; |
3081 | } |
3082 | |
3083 | case OP(COMPARE_SWAP): |
3084 | case OP(FETCH_ADD): { |
3085 | struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth; |
3086 | u64 vaddr = get_ib_ateth_vaddr(ateth); |
3087 | bool opfn = opcode == OP(COMPARE_SWAP) && |
3088 | vaddr == HFI1_VERBS_E_ATOMIC_VADDR; |
3089 | struct rvt_ack_entry *e; |
3090 | atomic64_t *maddr; |
3091 | u64 sdata; |
3092 | u32 rkey; |
3093 | u8 next; |
3094 | |
3095 | if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) && |
3096 | !opfn)) |
3097 | goto nack_inv; |
3098 | next = qp->r_head_ack_queue + 1; |
3099 | if (next > rvt_size_atomic(rdi: ib_to_rvt(ibdev: qp->ibqp.device))) |
3100 | next = 0; |
3101 | spin_lock_irqsave(&qp->s_lock, flags); |
3102 | if (unlikely(next == qp->s_acked_ack_queue)) { |
3103 | if (!qp->s_ack_queue[next].sent) |
3104 | goto nack_inv_unlck; |
3105 | update_ack_queue(qp, n: next); |
3106 | } |
3107 | e = &qp->s_ack_queue[qp->r_head_ack_queue]; |
3108 | release_rdma_sge_mr(e); |
3109 | /* Process OPFN special virtual address */ |
3110 | if (opfn) { |
3111 | opfn_conn_response(qp, e, ateth); |
3112 | goto ack; |
3113 | } |
3114 | if (unlikely(vaddr & (sizeof(u64) - 1))) |
3115 | goto nack_inv_unlck; |
3116 | rkey = be32_to_cpu(ateth->rkey); |
3117 | /* Check rkey & NAK */ |
3118 | if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), |
3119 | vaddr, rkey, |
3120 | IB_ACCESS_REMOTE_ATOMIC))) |
3121 | goto nack_acc_unlck; |
3122 | /* Perform atomic OP and save result. */ |
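		/*
		 * Either way, e->atomic_data holds the value that was at the
		 * target address before the operation: atomic64_add_return()
		 * yields the new value, so sdata is subtracted back out,
		 * while cmpxchg() returns the prior value directly.
		 */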
3123 | maddr = (atomic64_t *)qp->r_sge.sge.vaddr; |
3124 | sdata = get_ib_ateth_swap(ateth); |
3125 | e->atomic_data = (opcode == OP(FETCH_ADD)) ? |
3126 | (u64)atomic64_add_return(i: sdata, v: maddr) - sdata : |
3127 | (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, |
3128 | get_ib_ateth_compare(ateth), |
3129 | sdata); |
3130 | rvt_put_mr(mr: qp->r_sge.sge.mr); |
3131 | qp->r_sge.num_sge = 0; |
3132 | ack: |
3133 | e->opcode = opcode; |
3134 | e->sent = 0; |
3135 | e->psn = psn; |
3136 | e->lpsn = psn; |
3137 | qp->r_msn++; |
3138 | qp->r_psn++; |
3139 | qp->r_state = opcode; |
3140 | qp->r_nak_state = 0; |
3141 | qp->r_head_ack_queue = next; |
3142 | qpriv->r_tid_alloc = qp->r_head_ack_queue; |
3143 | |
3144 | /* Schedule the send engine. */ |
3145 | qp->s_flags |= RVT_S_RESP_PENDING; |
3146 | if (fecn) |
3147 | qp->s_flags |= RVT_S_ECN; |
3148 | hfi1_schedule_send(qp); |
3149 | |
3150 | spin_unlock_irqrestore(lock: &qp->s_lock, flags); |
3151 | return; |
3152 | } |
3153 | |
3154 | default: |
3155 | /* NAK unknown opcodes. */ |
3156 | goto nack_inv; |
3157 | } |
3158 | qp->r_psn++; |
3159 | qp->r_state = opcode; |
3160 | qp->r_ack_psn = psn; |
3161 | qp->r_nak_state = 0; |
3162 | /* Send an ACK if requested or required. */ |
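	/*
	 * ACKs are coalesced: up to HFI1_PSN_CREDIT acknowledgements may be
	 * deferred; otherwise (or when a FECN was seen) the ACK goes out
	 * right away.
	 */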
3163 | if (psn & IB_BTH_REQ_ACK || fecn) { |
3164 | if (packet->numpkt == 0 || fecn || |
3165 | qp->r_adefered >= HFI1_PSN_CREDIT) { |
3166 | rc_cancel_ack(qp); |
3167 | goto send_ack; |
3168 | } |
3169 | qp->r_adefered++; |
3170 | rc_defered_ack(rcd, qp); |
3171 | } |
3172 | return; |
3173 | |
3174 | rnr_nak: |
3175 | qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK; |
3176 | qp->r_ack_psn = qp->r_psn; |
3177 | /* Queue RNR NAK for later */ |
3178 | rc_defered_ack(rcd, qp); |
3179 | return; |
3180 | |
3181 | nack_op_err: |
3182 | rvt_rc_error(qp, err: IB_WC_LOC_QP_OP_ERR); |
3183 | qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; |
3184 | qp->r_ack_psn = qp->r_psn; |
3185 | /* Queue NAK for later */ |
3186 | rc_defered_ack(rcd, qp); |
3187 | return; |
3188 | |
3189 | nack_inv_unlck: |
3190 | spin_unlock_irqrestore(lock: &qp->s_lock, flags); |
3191 | nack_inv: |
3192 | rvt_rc_error(qp, err: IB_WC_LOC_QP_OP_ERR); |
3193 | qp->r_nak_state = IB_NAK_INVALID_REQUEST; |
3194 | qp->r_ack_psn = qp->r_psn; |
3195 | /* Queue NAK for later */ |
3196 | rc_defered_ack(rcd, qp); |
3197 | return; |
3198 | |
3199 | nack_acc_unlck: |
3200 | spin_unlock_irqrestore(lock: &qp->s_lock, flags); |
3201 | nack_acc: |
3202 | rvt_rc_error(qp, err: IB_WC_LOC_PROT_ERR); |
3203 | qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; |
3204 | qp->r_ack_psn = qp->r_psn; |
3205 | send_ack: |
3206 | hfi1_send_rc_ack(packet, is_fecn: fecn); |
3207 | } |
3208 | |
3209 | void hfi1_rc_hdrerr( |
3210 | struct hfi1_ctxtdata *rcd, |
3211 | struct hfi1_packet *packet, |
3212 | struct rvt_qp *qp) |
3213 | { |
3214 | struct hfi1_ibport *ibp = rcd_to_iport(rcd); |
3215 | int diff; |
3216 | u32 opcode; |
3217 | u32 psn; |
3218 | |
3219 | if (hfi1_ruc_check_hdr(ibp, packet)) |
3220 | return; |
3221 | |
3222 | psn = ib_bth_get_psn(ohdr: packet->ohdr); |
3223 | opcode = ib_bth_get_opcode(ohdr: packet->ohdr); |
3224 | |
3225 | /* Only deal with RDMA Writes for now */ |
3226 | if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { |
3227 | diff = delta_psn(a: psn, b: qp->r_psn); |
3228 | if (!qp->r_nak_state && diff >= 0) { |
3229 | ibp->rvp.n_rc_seqnak++; |
3230 | qp->r_nak_state = IB_NAK_PSN_ERROR; |
3231 | /* Use the expected PSN. */ |
3232 | qp->r_ack_psn = qp->r_psn; |
3233 | /* |
			 * Wait to send the sequence NAK until all packets
			 * in the receive queue have been processed.
			 * Otherwise, we end up propagating congestion.
3240 | */ |
3241 | rc_defered_ack(rcd, qp); |
3242 | } /* Out of sequence NAK */ |
3243 | } /* QP Request NAKs */ |
3244 | } |
3245 | |