// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2015, 2017 Oracle. All rights reserved.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 */

/* Lightweight memory registration using Fast Registration Work
 * Requests (FRWR).
 *
 * FRWR features ordered asynchronous registration and invalidation
 * of arbitrarily sized memory regions. This is the fastest and safest
 * but most complex memory registration mode.
 */

/* Normal operation
 *
 * A Memory Region is prepared for RDMA Read or Write using a FAST_REG
 * Work Request (frwr_map). When the RDMA operation is finished, this
 * Memory Region is invalidated using a LOCAL_INV Work Request
 * (frwr_unmap_async and frwr_unmap_sync).
 *
 * Typically FAST_REG Work Requests are not signaled, and neither are
 * RDMA Send Work Requests (with the exception of signaling occasionally
 * to prevent provider work queue overflows). This greatly reduces HCA
 * interrupt workload.
 */
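
/* Call flow, sketched for orientation (the callers named here live in
 * other files of this transport; this is illustrative, not normative):
 *
 *	frwr_map()		- marshaling registers each chunk
 *	frwr_send()		- chains the FastReg WRs to the Send WR
 *	   (server performs RDMA Read/Write)
 *	frwr_unmap_async()	- reply handling invalidates the MRs
 */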

/* Transport recovery
 *
 * frwr_map and frwr_unmap_* cannot run at the same time the transport
 * connect worker is running. The connect worker holds the transport
 * send lock, just as ->send_request does. This prevents frwr_map and
 * the connect worker from running concurrently. When a connection is
 * closed, the Receive completion queue is drained before allowing the
 * connect worker to get control. This prevents frwr_unmap and the
 * connect worker from running concurrently.
 *
 * When the underlying transport disconnects, MRs that are in flight
 * are flushed and are likely unusable. Thus all MRs are destroyed.
 * New MRs are created on demand.
 */

#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

static void frwr_cid_init(struct rpcrdma_ep *ep,
			  struct rpcrdma_mr *mr)
{
	struct rpc_rdma_cid *cid = &mr->mr_cid;

	cid->ci_queue_id = ep->re_attr.send_cq->res.id;
	cid->ci_completion_id = mr->mr_ibmr->res.id;
}

static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
{
	if (mr->mr_device) {
		trace_xprtrdma_mr_unmap(mr);
		ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents,
				mr->mr_dir);
		mr->mr_device = NULL;
	}
}

/**
 * frwr_mr_release - Destroy one MR
 * @mr: MR allocated by frwr_mr_init
 *
 */
void frwr_mr_release(struct rpcrdma_mr *mr)
{
	int rc;

	frwr_mr_unmap(mr->mr_xprt, mr);

	rc = ib_dereg_mr(mr->mr_ibmr);
	if (rc)
		trace_xprtrdma_frwr_dereg(mr, rc);
	kfree(mr->mr_sg);
	kfree(mr);
}

static void frwr_mr_put(struct rpcrdma_mr *mr)
{
	frwr_mr_unmap(mr->mr_xprt, mr);

	/* The MR is returned to the req's MR free list instead
	 * of to the xprt's MR free list. No spinlock is needed.
	 */
	rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
}

/* frwr_reset - Place MRs back on the free list
 * @req: request to reset
 *
 * Used after a failed marshal. For FRWR, this means the MRs
 * don't have to be fully released and recreated.
 *
 * NB: This is safe only as long as none of @req's MRs are
 * involved with an ongoing asynchronous FAST_REG or LOCAL_INV
 * Work Request.
 */
void frwr_reset(struct rpcrdma_req *req)
{
	struct rpcrdma_mr *mr;

	while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
		frwr_mr_put(mr);
}

/**
 * frwr_mr_init - Initialize one MR
 * @r_xprt: controlling transport instance
 * @mr: generic MR to prepare for FRWR
 *
 * Returns zero if successful. Otherwise a negative errno
 * is returned.
 */
int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
{
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	unsigned int depth = ep->re_max_fr_depth;
	struct scatterlist *sg;
	struct ib_mr *frmr;

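	/* Note: the scatterlist is sized to the device's maximum FRWR
	 * depth and allocated on the RDMA device's NUMA node, keeping
	 * the DMA descriptors close to the HCA.
	 */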
	sg = kcalloc_node(depth, sizeof(*sg), XPRTRDMA_GFP_FLAGS,
			  ibdev_to_node(ep->re_id->device));
	if (!sg)
		return -ENOMEM;

	frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth);
	if (IS_ERR(frmr))
		goto out_mr_err;

	mr->mr_xprt = r_xprt;
	mr->mr_ibmr = frmr;
	mr->mr_device = NULL;
	INIT_LIST_HEAD(&mr->mr_list);
	init_completion(&mr->mr_linv_done);
	frwr_cid_init(ep, mr);

	sg_init_table(sg, depth);
	mr->mr_sg = sg;
	return 0;

out_mr_err:
	kfree(sg);
	trace_xprtrdma_frwr_alloc(mr, PTR_ERR(frmr));
	return PTR_ERR(frmr);
}

/**
 * frwr_query_device - Prepare a transport for use with FRWR
 * @ep: endpoint to fill in
 * @device: RDMA device to query
 *
 * On success, sets:
 *	ep->re_attr
 *	ep->re_max_requests
 *	ep->re_max_rdma_segs
 *	ep->re_max_fr_depth
 *	ep->re_mrtype
 *
 * Return values:
 *   On success, returns zero.
 *   %-EINVAL - the device does not support FRWR memory registration
 *   %-ENOMEM - the device is not sufficiently capable for NFS/RDMA
 */
int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
{
	const struct ib_device_attr *attrs = &device->attrs;
	int max_qp_wr, depth, delta;
	unsigned int max_sge;

	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
	    attrs->max_fast_reg_page_list_len == 0) {
		pr_err("rpcrdma: 'frwr' mode is not supported by device %s\n",
		       device->name);
		return -EINVAL;
	}

	max_sge = min_t(unsigned int, attrs->max_send_sge,
			RPCRDMA_MAX_SEND_SGES);
	if (max_sge < RPCRDMA_MIN_SEND_SGES) {
		pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge);
		return -ENOMEM;
	}
	ep->re_attr.cap.max_send_sge = max_sge;
	ep->re_attr.cap.max_recv_sge = 1;

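	/* Prefer SG_GAPS registration when the device supports it:
	 * an SG_GAPS MR tolerates physically discontiguous pages, so
	 * frwr_map can pack more segments into each MR.
	 */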
	ep->re_mrtype = IB_MR_TYPE_MEM_REG;
	if (attrs->kernel_cap_flags & IBK_SG_GAPS_REG)
		ep->re_mrtype = IB_MR_TYPE_SG_GAPS;

	/* Quirk: Some devices advertise a large max_fast_reg_page_list_len
	 * capability, but perform optimally when the MRs are not larger
	 * than a page.
	 */
	if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS)
		ep->re_max_fr_depth = attrs->max_sge_rd;
	else
		ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len;
	if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS)
		ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS;

	/* Add room for frwr register and invalidate WRs.
	 * 1. FRWR reg WR for head
	 * 2. FRWR invalidate WR for head
	 * 3. N FRWR reg WRs for pagelist
	 * 4. N FRWR invalidate WRs for pagelist
	 * 5. FRWR reg WR for tail
	 * 6. FRWR invalidate WR for tail
	 * 7. The RDMA_SEND WR
	 */
	depth = 7;

	/* Calculate N if the device max FRWR depth is smaller than
	 * RPCRDMA_MAX_DATA_SEGS.
	 */
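	/* Illustrative arithmetic (hypothetical device): with an FRWR
	 * depth of 16 and a 64-segment maximum payload, delta starts
	 * at 48 and the loop below runs three times, raising depth
	 * from 7 to 13 WRs per RPC.
	 */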
	if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) {
		delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth;
		do {
			depth += 2; /* FRWR reg + invalidate */
			delta -= ep->re_max_fr_depth;
		} while (delta > 0);
	}

	max_qp_wr = attrs->max_qp_wr;
	max_qp_wr -= RPCRDMA_BACKWARD_WRS;
	max_qp_wr -= 1;
	if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
		return -ENOMEM;
	if (ep->re_max_requests > max_qp_wr)
		ep->re_max_requests = max_qp_wr;
	ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
	if (ep->re_attr.cap.max_send_wr > max_qp_wr) {
		ep->re_max_requests = max_qp_wr / depth;
		if (!ep->re_max_requests)
			return -ENOMEM;
		ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
	}
	ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
	ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
	ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
	ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
	ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH;
	ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */

	ep->re_max_rdma_segs =
		DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth);
	/* Reply chunks require segments for head and tail buffers */
	ep->re_max_rdma_segs += 2;
	if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
		ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;

	/* Ensure the underlying device is capable of conveying the
	 * largest r/wsize NFS will ask for. This guarantees that
	 * failing over from one RDMA device to another will not
	 * break NFS I/O.
	 */
	if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS)
		return -ENOMEM;

	return 0;
}

/**
 * frwr_map - Register a memory region
 * @r_xprt: controlling transport
 * @seg: memory region co-ordinates
 * @nsegs: number of segments remaining
 * @writing: true when RDMA Write will be used
 * @xid: XID of RPC using the registered memory
 * @mr: MR to fill in
 *
 * Prepare a REG_MR Work Request to register a memory region
 * for remote access via RDMA READ or RDMA WRITE.
 *
 * Returns the next segment or a negative errno pointer.
 * On success, @mr is filled in.
 */
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
				struct rpcrdma_mr_seg *seg,
				int nsegs, bool writing, __be32 xid,
				struct rpcrdma_mr *mr)
{
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	struct ib_reg_wr *reg_wr;
	int i, n, dma_nents;
	struct ib_mr *ibmr;
	u8 key;

	if (nsegs > ep->re_max_fr_depth)
		nsegs = ep->re_max_fr_depth;
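	/* Build the scatterlist: coalesce adjacent segments until a
	 * page-alignment discontinuity is found. Devices that support
	 * SG_GAPS registration can map discontiguous buffers, so no
	 * such break is needed for them.
	 */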
	for (i = 0; i < nsegs;) {
		sg_set_page(&mr->mr_sg[i], seg->mr_page,
			    seg->mr_len, seg->mr_offset);

		++seg;
		++i;
		if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
			continue;
		if ((i < nsegs && seg->mr_offset) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	mr->mr_dir = rpcrdma_data_dir(writing);
	mr->mr_nents = i;

	dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents,
				  mr->mr_dir);
	if (!dma_nents)
		goto out_dmamap_err;
	mr->mr_device = ep->re_id->device;

	ibmr = mr->mr_ibmr;
	n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
	if (n != dma_nents)
		goto out_mapmr_err;

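	/* Stuff the RPC XID into the upper bits of the MR's iova, and
	 * bump the rkey's low-order key byte. The XID makes wire traces
	 * easier to correlate with RPCs; the fresh key helps prevent
	 * the remote from using a stale rkey left over from an earlier
	 * registration of this MR.
	 */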
	ibmr->iova &= 0x00000000ffffffff;
	ibmr->iova |= ((u64)be32_to_cpu(xid)) << 32;
	key = (u8)(ibmr->rkey & 0x000000FF);
	ib_update_fast_reg_key(ibmr, ++key);

	reg_wr = &mr->mr_regwr;
	reg_wr->mr = ibmr;
	reg_wr->key = ibmr->rkey;
	reg_wr->access = writing ?
			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
			 IB_ACCESS_REMOTE_READ;

	mr->mr_handle = ibmr->rkey;
	mr->mr_length = ibmr->length;
	mr->mr_offset = ibmr->iova;
	trace_xprtrdma_mr_map(mr);

	return seg;

out_dmamap_err:
	trace_xprtrdma_frwr_sgerr(mr, i);
	return ERR_PTR(-EIO);

out_mapmr_err:
	trace_xprtrdma_frwr_maperr(mr, n);
	return ERR_PTR(-EIO);
}

/**
 * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
 * @cq: completion queue
 * @wc: WCE for a completed FastReg WR
 *
 * Each flushed MR gets destroyed after the QP has drained.
 */
static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_fastreg(wc, &mr->mr_cid);

	rpcrdma_flush_disconnect(cq->cq_context, wc);
}

/**
 * frwr_send - post Send WRs containing the RPC Call message
 * @r_xprt: controlling transport instance
 * @req: prepared RPC Call
 *
 * For FRWR, chain any FastReg WRs to the Send WR. Only a
 * single ib_post_send call is needed to register memory
 * and then post the Send WR.
 *
 * Returns the return code from ib_post_send.
 *
 * Caller must hold the transport send lock to ensure that the
 * pointers to the transport's rdma_cm_id and QP are stable.
 */
int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	struct ib_send_wr *post_wr, *send_wr = &req->rl_wr;
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	struct rpcrdma_mr *mr;
	unsigned int num_wrs;
	int ret;

	num_wrs = 1;
	post_wr = send_wr;
	list_for_each_entry(mr, &req->rl_registered, mr_list) {
		trace_xprtrdma_mr_fastreg(mr);

		mr->mr_cqe.done = frwr_wc_fastreg;
		mr->mr_regwr.wr.next = post_wr;
		mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe;
		mr->mr_regwr.wr.num_sge = 0;
		mr->mr_regwr.wr.opcode = IB_WR_REG_MR;
		mr->mr_regwr.wr.send_flags = 0;
		post_wr = &mr->mr_regwr.wr;
		++num_wrs;
	}

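	/* Send WRs are normally unsignaled to limit HCA interrupts.
	 * Signal this Send when unsignaled WRs would otherwise exhaust
	 * the provider's Send Queue accounting, or when another context
	 * still holds a reference to @req and needs a completion to
	 * release it.
	 */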
	if ((kref_read(&req->rl_kref) > 1) || num_wrs > ep->re_send_count) {
		send_wr->send_flags |= IB_SEND_SIGNALED;
		ep->re_send_count = min_t(unsigned int, ep->re_send_batch,
					  num_wrs - ep->re_send_count);
	} else {
		send_wr->send_flags &= ~IB_SEND_SIGNALED;
		ep->re_send_count -= num_wrs;
	}

	trace_xprtrdma_post_send(req);
	ret = ib_post_send(ep->re_id->qp, post_wr, NULL);
	if (ret)
		trace_xprtrdma_post_send_err(r_xprt, req, ret);
	return ret;
}

/**
 * frwr_reminv - handle a remotely invalidated mr on the @mrs list
 * @rep: Received reply
 * @mrs: list of MRs to check
 *
 */
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
{
	struct rpcrdma_mr *mr;

	list_for_each_entry(mr, mrs, mr_list)
		if (mr->mr_handle == rep->rr_inv_rkey) {
			list_del_init(&mr->mr_list);
			trace_xprtrdma_mr_reminv(mr);
			frwr_mr_put(mr);
			break;	/* only one invalidated MR per RPC */
		}
}

static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr)
{
	if (likely(wc->status == IB_WC_SUCCESS))
		frwr_mr_put(mr);
}

/**
 * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
 * @cq: completion queue
 * @wc: WCE for a completed LocalInv WR
 *
 */
static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li(wc, &mr->mr_cid);
	frwr_mr_done(wc, mr);

	rpcrdma_flush_disconnect(cq->cq_context, wc);
}

/**
 * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
 * @cq: completion queue
 * @wc: WCE for a completed LocalInv WR
 *
 * Awaken anyone waiting for an MR to finish being fenced.
 */
static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid);
	frwr_mr_done(wc, mr);
	complete(&mr->mr_linv_done);

	rpcrdma_flush_disconnect(cq->cq_context, wc);
}

/**
 * frwr_unmap_sync - invalidate memory regions that were registered for @req
 * @r_xprt: controlling transport instance
 * @req: rpcrdma_req with a non-empty list of MRs to process
 *
 * Sleeps until it is safe for the host CPU to access the previously mapped
 * memory regions. This guarantees that registered MRs are properly fenced
 * from the server before the RPC consumer accesses the data in them. It
 * also ensures proper Send flow control: waking the next RPC waits until
 * this RPC has relinquished all its Send Queue entries.
 */
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	struct ib_send_wr *first, **prev, *last;
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	const struct ib_send_wr *bad_wr;
	struct rpcrdma_mr *mr;
	int rc;

	/* ORDER: Invalidate all of the MRs first
	 *
	 * Chain the LOCAL_INV Work Requests and post them with
	 * a single ib_post_send() call.
	 */
	prev = &first;
	mr = rpcrdma_mr_pop(&req->rl_registered);
	do {
		trace_xprtrdma_mr_localinv(mr);
		r_xprt->rx_stats.local_inv_needed++;

		last = &mr->mr_invwr;
		last->next = NULL;
		last->wr_cqe = &mr->mr_cqe;
		last->sg_list = NULL;
		last->num_sge = 0;
		last->opcode = IB_WR_LOCAL_INV;
		last->send_flags = IB_SEND_SIGNALED;
		last->ex.invalidate_rkey = mr->mr_handle;

		last->wr_cqe->done = frwr_wc_localinv;

		*prev = last;
		prev = &last->next;
	} while ((mr = rpcrdma_mr_pop(&req->rl_registered)));

	mr = container_of(last, struct rpcrdma_mr, mr_invwr);

	/* Strong send queue ordering guarantees that when the
	 * last WR in the chain completes, all WRs in the chain
	 * are complete.
	 */
	last->wr_cqe->done = frwr_wc_localinv_wake;
	reinit_completion(&mr->mr_linv_done);

	/* Transport disconnect drains the receive CQ before it
	 * replaces the QP. The RPC reply handler won't call us
	 * unless re_id->qp is a valid pointer.
	 */
	bad_wr = NULL;
	rc = ib_post_send(ep->re_id->qp, first, &bad_wr);

	/* The final LOCAL_INV WR in the chain is supposed to
	 * do the wake. If it was never posted, the wake will
	 * not happen, so don't wait in that case.
	 */
	if (bad_wr != first)
		wait_for_completion(&mr->mr_linv_done);
	if (!rc)
		return;

	/* On error, the MRs get destroyed once the QP has drained. */
	trace_xprtrdma_post_linv_err(req, rc);

	/* Force a connection loss to ensure complete recovery.
	 */
	rpcrdma_force_disconnect(ep);
}

/**
 * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
 * @cq: completion queue
 * @wc: WCE for a completed LocalInv WR
 *
 */
static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe);
	struct rpcrdma_rep *rep;

	/* WARNING: Only wr_cqe and status are reliable at this point */
	trace_xprtrdma_wc_li_done(wc, &mr->mr_cid);

	/* Ensure that @rep is generated before the MR is released */
	rep = mr->mr_req->rl_reply;
	smp_rmb();

	if (wc->status != IB_WC_SUCCESS) {
		if (rep)
			rpcrdma_unpin_rqst(rep);
		rpcrdma_flush_disconnect(cq->cq_context, wc);
		return;
	}
	frwr_mr_put(mr);
	rpcrdma_complete_rqst(rep);
}

/**
 * frwr_unmap_async - invalidate memory regions that were registered for @req
 * @r_xprt: controlling transport instance
 * @req: rpcrdma_req with a non-empty list of MRs to process
 *
 * This guarantees that registered MRs are properly fenced from the
 * server before the RPC consumer accesses the data in them. It also
 * ensures proper Send flow control: waking the next RPC waits until
 * this RPC has relinquished all its Send Queue entries.
 */
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	struct ib_send_wr *first, *last, **prev;
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	struct rpcrdma_mr *mr;
	int rc;

	/* Chain the LOCAL_INV Work Requests and post them with
	 * a single ib_post_send() call.
	 */
	prev = &first;
	mr = rpcrdma_mr_pop(&req->rl_registered);
	do {
		trace_xprtrdma_mr_localinv(mr);
		r_xprt->rx_stats.local_inv_needed++;

		last = &mr->mr_invwr;
		last->next = NULL;
		last->wr_cqe = &mr->mr_cqe;
		last->sg_list = NULL;
		last->num_sge = 0;
		last->opcode = IB_WR_LOCAL_INV;
		last->send_flags = IB_SEND_SIGNALED;
		last->ex.invalidate_rkey = mr->mr_handle;

		last->wr_cqe->done = frwr_wc_localinv;

		*prev = last;
		prev = &last->next;
	} while ((mr = rpcrdma_mr_pop(&req->rl_registered)));

	/* Strong send queue ordering guarantees that when the
	 * last WR in the chain completes, all WRs in the chain
	 * are complete. The last completion will wake up the
	 * RPC waiter.
	 */
	last->wr_cqe->done = frwr_wc_localinv_done;

	/* Transport disconnect drains the receive CQ before it
	 * replaces the QP. The RPC reply handler won't call us
	 * unless re_id->qp is a valid pointer.
	 */
	rc = ib_post_send(ep->re_id->qp, first, NULL);
	if (!rc)
		return;

	/* On error, the MRs get destroyed once the QP has drained. */
	trace_xprtrdma_post_linv_err(req, rc);

	/* The final LOCAL_INV WR in the chain is supposed to
	 * do the wake. If it was never posted, the wake does
	 * not happen. Unpin the rqst in preparation for its
	 * retransmission.
	 */
	rpcrdma_unpin_rqst(req->rl_reply);

	/* Force a connection loss to ensure complete recovery.
	 */
	rpcrdma_force_disconnect(ep);
}

/**
 * frwr_wp_create - Create an MR for padding Write chunks
 * @r_xprt: transport resources to use
 *
 * Return 0 on success, negative errno on failure.
 */
int frwr_wp_create(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	struct rpcrdma_mr_seg seg;
	struct rpcrdma_mr *mr;

	mr = rpcrdma_mr_get(r_xprt);
	if (!mr)
		return -EAGAIN;
	mr->mr_req = NULL;
	ep->re_write_pad_mr = mr;

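	/* Register a single XDR_UNIT of zeroes. Advertising this MR as
	 * the final segment of a Write chunk gives the server somewhere
	 * to write XDR roundup padding without disturbing the real
	 * receive buffers.
	 */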
	seg.mr_len = XDR_UNIT;
	seg.mr_page = virt_to_page(ep->re_write_pad);
	seg.mr_offset = offset_in_page(ep->re_write_pad);
	if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr)))
		return -EIO;
	trace_xprtrdma_mr_fastreg(mr);

	mr->mr_cqe.done = frwr_wc_fastreg;
	mr->mr_regwr.wr.next = NULL;
	mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe;
	mr->mr_regwr.wr.num_sge = 0;
	mr->mr_regwr.wr.opcode = IB_WR_REG_MR;
	mr->mr_regwr.wr.send_flags = 0;

	return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL);
}