1 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
2 | /* |
3 | * Copyright (c) 2014-2020, Oracle and/or its affiliates. |
4 | * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. |
5 | * |
6 | * This software is available to you under a choice of one of two |
7 | * licenses. You may choose to be licensed under the terms of the GNU |
8 | * General Public License (GPL) Version 2, available from the file |
9 | * COPYING in the main directory of this source tree, or the BSD-type |
10 | * license below: |
11 | * |
12 | * Redistribution and use in source and binary forms, with or without |
13 | * modification, are permitted provided that the following conditions |
14 | * are met: |
15 | * |
16 | * Redistributions of source code must retain the above copyright |
17 | * notice, this list of conditions and the following disclaimer. |
18 | * |
19 | * Redistributions in binary form must reproduce the above |
20 | * copyright notice, this list of conditions and the following |
21 | * disclaimer in the documentation and/or other materials provided |
22 | * with the distribution. |
23 | * |
24 | * Neither the name of the Network Appliance, Inc. nor the names of |
25 | * its contributors may be used to endorse or promote products |
26 | * derived from this software without specific prior written |
27 | * permission. |
28 | * |
29 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
30 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
31 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
32 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
33 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
34 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
35 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
36 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
37 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
38 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
39 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
40 | */ |
41 | |
42 | /* |
43 | * rpc_rdma.c |
44 | * |
45 | * This file contains the guts of the RPC RDMA protocol, and |
46 | * does marshaling/unmarshaling, etc. It is also where interfacing |
47 | * to the Linux RPC framework lives. |
48 | */ |
49 | |
50 | #include <linux/highmem.h> |
51 | |
52 | #include <linux/sunrpc/svc_rdma.h> |
53 | |
54 | #include "xprt_rdma.h" |
55 | #include <trace/events/rpcrdma.h> |
56 | |
57 | /* Returns size of largest RPC-over-RDMA header in a Call message |
58 | * |
59 | * The largest Call header contains a full-size Read list and a |
60 | * minimal Reply chunk. |
61 | */ |
62 | static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) |
63 | { |
64 | unsigned int size; |
65 | |
66 | /* Fixed header fields and list discriminators */ |
67 | size = RPCRDMA_HDRLEN_MIN; |
68 | |
69 | /* Maximum Read list size */ |
70 | size += maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); |
71 | |
72 | /* Minimal Reply chunk size */ |
73 | size += sizeof(__be32); /* segment count */ |
74 | size += rpcrdma_segment_maxsz * sizeof(__be32); |
75 | size += sizeof(__be32); /* list discriminator */ |
76 | |
77 | return size; |
78 | } |
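| |
| /* For illustration only: with maxsegs = 8, and assuming the values |
|  * defined in rpc_rdma.h and xprt_rdma.h (RPCRDMA_HDRLEN_MIN is 7 XDR |
|  * words, rpcrdma_readchunk_maxsz is 6 words, and rpcrdma_segment_maxsz |
|  * is 4 words), the largest Call header would be |
|  * 28 + (8 * 6 * 4) + (4 + 16 + 4) = 244 bytes. This is the amount |
|  * subtracted from the inline send threshold in |
|  * rpcrdma_set_max_header_sizes() below. |
|  */ |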
79 | |
80 | /* Returns size of largest RPC-over-RDMA header in a Reply message |
81 | * |
82 | * There is only one Write list or one Reply chunk per Reply |
83 | * message. The larger list is the Write list. |
84 | */ |
85 | static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) |
86 | { |
87 | unsigned int size; |
88 | |
89 | /* Fixed header fields and list discriminators */ |
90 | size = RPCRDMA_HDRLEN_MIN; |
91 | |
92 | /* Maximum Write list size */ |
93 | size += sizeof(__be32); /* segment count */ |
94 | size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); |
95 | size += sizeof(__be32); /* list discriminator */ |
96 | |
97 | return size; |
98 | } |
99 | |
100 | /** |
101 | * rpcrdma_set_max_header_sizes - Initialize inline payload sizes |
102 | * @ep: endpoint to initialize |
103 | * |
104 | * The max_inline fields contain the maximum size of an RPC message |
105 | * so the marshaling code doesn't have to repeat this calculation |
106 | * for every RPC. |
107 | */ |
108 | void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep) |
109 | { |
110 | unsigned int maxsegs = ep->re_max_rdma_segs; |
111 | |
112 | ep->re_max_inline_send = |
113 | ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs); |
114 | ep->re_max_inline_recv = |
115 | ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs); |
116 | } |
117 | |
118 | /* The client can send a request inline as long as the RPCRDMA header |
119 | * plus the RPC call fit under the transport's inline limit. If the |
120 | * combined call message size exceeds that limit, the client must use |
121 | * a Read chunk for this operation. |
122 | * |
123 | * A Read chunk is also required if sending the RPC call inline would |
124 | * exceed this device's max_sge limit. |
125 | */ |
126 | static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, |
127 | struct rpc_rqst *rqst) |
128 | { |
129 | struct xdr_buf *xdr = &rqst->rq_snd_buf; |
130 | struct rpcrdma_ep *ep = r_xprt->rx_ep; |
131 | unsigned int count, remaining, offset; |
132 | |
133 | if (xdr->len > ep->re_max_inline_send) |
134 | return false; |
135 | |
136 | if (xdr->page_len) { |
137 | remaining = xdr->page_len; |
138 | offset = offset_in_page(xdr->page_base); |
139 | count = RPCRDMA_MIN_SEND_SGES; |
140 | while (remaining) { |
141 | remaining -= min_t(unsigned int, |
142 | PAGE_SIZE - offset, remaining); |
143 | offset = 0; |
144 | if (++count > ep->re_attr.cap.max_send_sge) |
145 | return false; |
146 | } |
147 | } |
148 | |
149 | return true; |
150 | } |
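| |
| /* A sketch of the SGE accounting above, assuming PAGE_SIZE is 4096 |
|  * and RPCRDMA_MIN_SEND_SGES is 3: a page_len of 6000 that starts at |
|  * byte 3000 of its first page spans three pages (1096 + 4096 + 808 |
|  * bytes), so count reaches 6. If the device's max_send_sge is below |
|  * that, the Call is sent with a Read chunk instead. |
|  */ |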
151 | |
152 | /* The client can't know how large the actual reply will be. Thus it |
153 | * plans for the largest possible reply for that particular ULP |
154 | * operation. If the maximum combined reply message size exceeds that |
155 | * limit, the client must provide a write list or a reply chunk for |
156 | * this request. |
157 | */ |
158 | static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, |
159 | struct rpc_rqst *rqst) |
160 | { |
161 | return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv; |
162 | } |
163 | |
164 | /* The client is required to provide a Reply chunk if the maximum |
165 | * size of the non-payload part of the RPC Reply is larger than |
166 | * the inline threshold. |
167 | */ |
168 | static bool |
169 | rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, |
170 | const struct rpc_rqst *rqst) |
171 | { |
172 | const struct xdr_buf *buf = &rqst->rq_rcv_buf; |
173 | |
174 | return (buf->head[0].iov_len + buf->tail[0].iov_len) < |
175 | r_xprt->rx_ep->re_max_inline_recv; |
176 | } |
177 | |
178 | /* ACL likes to be lazy in allocating pages. For TCP, these |
179 | * pages can be allocated during receive processing. Not true |
180 | * for RDMA, which must always provision receive buffers |
181 | * up front. |
182 | */ |
183 | static noinline int |
184 | rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) |
185 | { |
186 | struct page **ppages; |
187 | int len; |
188 | |
189 | len = buf->page_len; |
190 | ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); |
191 | while (len > 0) { |
192 | if (!*ppages) |
193 | *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); |
194 | if (!*ppages) |
195 | return -ENOBUFS; |
196 | ppages++; |
197 | len -= PAGE_SIZE; |
198 | } |
199 | |
200 | return 0; |
201 | } |
202 | |
203 | /* Convert @vec to a single SGL element. |
204 | * |
205 | * Returns pointer to next available SGE, and bumps the total number |
206 | * of SGEs consumed. |
207 | */ |
208 | static struct rpcrdma_mr_seg * |
209 | rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, |
210 | unsigned int *n) |
211 | { |
212 | seg->mr_page = virt_to_page(vec->iov_base); |
213 | seg->mr_offset = offset_in_page(vec->iov_base); |
214 | seg->mr_len = vec->iov_len; |
215 | ++seg; |
216 | ++(*n); |
217 | return seg; |
218 | } |
219 | |
220 | /* Convert @xdrbuf into SGEs no larger than a page each. As they |
221 | * are registered, these SGEs are then coalesced into RDMA segments |
222 | * when the selected memreg mode supports it. |
223 | * |
224 | * Returns positive number of SGEs consumed, or a negative errno. |
225 | */ |
226 | |
227 | static int |
228 | rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, |
229 | unsigned int pos, enum rpcrdma_chunktype type, |
230 | struct rpcrdma_mr_seg *seg) |
231 | { |
232 | unsigned long page_base; |
233 | unsigned int len, n; |
234 | struct page **ppages; |
235 | |
236 | n = 0; |
237 | if (pos == 0) |
238 | seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); |
239 | |
240 | len = xdrbuf->page_len; |
241 | ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); |
242 | page_base = offset_in_page(xdrbuf->page_base); |
243 | while (len) { |
244 | seg->mr_page = *ppages; |
245 | seg->mr_offset = page_base; |
246 | seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); |
247 | len -= seg->mr_len; |
248 | ++ppages; |
249 | ++seg; |
250 | ++n; |
251 | page_base = 0; |
252 | } |
253 | |
254 | if (type == rpcrdma_readch || type == rpcrdma_writech) |
255 | goto out; |
256 | |
257 | if (xdrbuf->tail[0].iov_len) |
258 | rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); |
259 | |
260 | out: |
261 | if (unlikely(n > RPCRDMA_MAX_SEGS)) |
262 | return -EIO; |
263 | return n; |
264 | } |
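| |
| /* A sketch of the conversion above, assuming PAGE_SIZE is 4096: an |
|  * xdr_buf with a 120-byte head kvec, a 6000-byte page list starting |
|  * 3000 bytes into its first page, and an empty tail converts (at |
|  * pos 0) into four segments: one for the head and three for the page |
|  * list (1096, 4096, and 808 bytes). A registration mode that handles |
|  * multiple pages per MR can then coalesce the page-list segments into |
|  * a single RDMA segment. |
|  */ |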
265 | |
266 | static int |
267 | encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) |
268 | { |
269 | __be32 *p; |
270 | |
271 | p = xdr_reserve_space(xdr, 4 * sizeof(*p)); |
272 | if (unlikely(!p)) |
273 | return -EMSGSIZE; |
274 | |
275 | xdr_encode_rdma_segment(p, mr->mr_handle, mr->mr_length, mr->mr_offset); |
276 | return 0; |
277 | } |
278 | |
279 | static int |
280 | encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, |
281 | u32 position) |
282 | { |
283 | __be32 *p; |
284 | |
285 | p = xdr_reserve_space(xdr, 6 * sizeof(*p)); |
286 | if (unlikely(!p)) |
287 | return -EMSGSIZE; |
288 | |
289 | *p++ = xdr_one; /* Item present */ |
290 | xdr_encode_read_segment(p, position, mr->mr_handle, mr->mr_length, |
291 | mr->mr_offset); |
292 | return 0; |
293 | } |
294 | |
295 | static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, |
296 | struct rpcrdma_req *req, |
297 | struct rpcrdma_mr_seg *seg, |
298 | int nsegs, bool writing, |
299 | struct rpcrdma_mr **mr) |
300 | { |
301 | *mr = rpcrdma_mr_pop(&req->rl_free_mrs); |
302 | if (!*mr) { |
303 | *mr = rpcrdma_mr_get(r_xprt); |
304 | if (!*mr) |
305 | goto out_getmr_err; |
306 | (*mr)->mr_req = req; |
307 | } |
308 | |
309 | rpcrdma_mr_push(*mr, &req->rl_registered); |
310 | return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); |
311 | |
312 | out_getmr_err: |
313 | trace_xprtrdma_nomrs_err(r_xprt, req); |
314 | xprt_wait_for_buffer_space(&r_xprt->rx_xprt); |
315 | rpcrdma_mrs_refresh(r_xprt); |
316 | return ERR_PTR(-EAGAIN); |
317 | } |
318 | |
319 | /* Register and XDR encode the Read list. Supports encoding a list of read |
320 | * segments that belong to a single read chunk. |
321 | * |
322 | * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): |
323 | * |
324 | * Read chunklist (a linked list): |
325 | * N elements, position P (same P for all chunks of same arg!): |
326 | * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 |
327 | * |
328 | * Returns zero on success, or a negative errno if a failure occurred. |
329 | * @xdr is advanced to the next position in the stream. |
330 | * |
331 | * Only a single @pos value is currently supported. |
332 | */ |
333 | static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, |
334 | struct rpcrdma_req *req, |
335 | struct rpc_rqst *rqst, |
336 | enum rpcrdma_chunktype rtype) |
337 | { |
338 | struct xdr_stream *xdr = &req->rl_stream; |
339 | struct rpcrdma_mr_seg *seg; |
340 | struct rpcrdma_mr *mr; |
341 | unsigned int pos; |
342 | int nsegs; |
343 | |
344 | if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) |
345 | goto done; |
346 | |
347 | pos = rqst->rq_snd_buf.head[0].iov_len; |
348 | if (rtype == rpcrdma_areadch) |
349 | pos = 0; |
350 | seg = req->rl_segments; |
351 | nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, |
352 | rtype, seg); |
353 | if (nsegs < 0) |
354 | return nsegs; |
355 | |
356 | do { |
357 | seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr); |
358 | if (IS_ERR(seg)) |
359 | return PTR_ERR(seg); |
360 | |
361 | if (encode_read_segment(xdr, mr, pos) < 0) |
362 | return -EMSGSIZE; |
363 | |
364 | trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs); |
365 | r_xprt->rx_stats.read_chunk_count++; |
366 | nsegs -= mr->mr_nents; |
367 | } while (nsegs); |
368 | |
369 | done: |
370 | if (xdr_stream_encode_item_absent(xdr) < 0) |
371 | return -EMSGSIZE; |
372 | return 0; |
373 | } |
374 | |
375 | /* Register and XDR encode the Write list. Supports encoding a list |
376 | * containing one array of plain segments that belong to a single |
377 | * write chunk. |
378 | * |
379 | * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): |
380 | * |
381 | * Write chunklist (a list of (one) counted array): |
382 | * N elements: |
383 | * 1 - N - HLOO - HLOO - ... - HLOO - 0 |
384 | * |
385 | * Returns zero on success, or a negative errno if a failure occurred. |
386 | * @xdr is advanced to the next position in the stream. |
387 | * |
388 | * Only a single Write chunk is currently supported. |
389 | */ |
390 | static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, |
391 | struct rpcrdma_req *req, |
392 | struct rpc_rqst *rqst, |
393 | enum rpcrdma_chunktype wtype) |
394 | { |
395 | struct xdr_stream *xdr = &req->rl_stream; |
396 | struct rpcrdma_ep *ep = r_xprt->rx_ep; |
397 | struct rpcrdma_mr_seg *seg; |
398 | struct rpcrdma_mr *mr; |
399 | int nsegs, nchunks; |
400 | __be32 *segcount; |
401 | |
402 | if (wtype != rpcrdma_writech) |
403 | goto done; |
404 | |
405 | seg = req->rl_segments; |
406 | nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, |
407 | rqst->rq_rcv_buf.head[0].iov_len, |
408 | wtype, seg); |
409 | if (nsegs < 0) |
410 | return nsegs; |
411 | |
412 | if (xdr_stream_encode_item_present(xdr) < 0) |
413 | return -EMSGSIZE; |
414 | segcount = xdr_reserve_space(xdr, sizeof(*segcount)); |
415 | if (unlikely(!segcount)) |
416 | return -EMSGSIZE; |
417 | /* Actual value encoded below */ |
418 | |
419 | nchunks = 0; |
420 | do { |
421 | seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); |
422 | if (IS_ERR(seg)) |
423 | return PTR_ERR(seg); |
424 | |
425 | if (encode_rdma_segment(xdr, mr) < 0) |
426 | return -EMSGSIZE; |
427 | |
428 | trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs); |
429 | r_xprt->rx_stats.write_chunk_count++; |
430 | r_xprt->rx_stats.total_rdma_request += mr->mr_length; |
431 | nchunks++; |
432 | nsegs -= mr->mr_nents; |
433 | } while (nsegs); |
434 | |
435 | if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) { |
436 | if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0) |
437 | return -EMSGSIZE; |
438 | |
439 | trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr, |
440 | nsegs); |
441 | r_xprt->rx_stats.write_chunk_count++; |
442 | r_xprt->rx_stats.total_rdma_request += mr->mr_length; |
443 | nchunks++; |
444 | nsegs -= mr->mr_nents; |
445 | } |
446 | |
447 | /* Update count of segments in this Write chunk */ |
448 | *segcount = cpu_to_be32(nchunks); |
449 | |
450 | done: |
451 | if (xdr_stream_encode_item_absent(xdr) < 0) |
452 | return -EMSGSIZE; |
453 | return 0; |
454 | } |
455 | |
456 | /* Register and XDR encode the Reply chunk. Supports encoding an array |
457 | * of plain segments that belong to a single write (reply) chunk. |
458 | * |
459 | * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): |
460 | * |
461 | * Reply chunk (a counted array): |
462 | * N elements: |
463 | * 1 - N - HLOO - HLOO - ... - HLOO |
464 | * |
465 | * Returns zero on success, or a negative errno if a failure occurred. |
466 | * @xdr is advanced to the next position in the stream. |
467 | */ |
468 | static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, |
469 | struct rpcrdma_req *req, |
470 | struct rpc_rqst *rqst, |
471 | enum rpcrdma_chunktype wtype) |
472 | { |
473 | struct xdr_stream *xdr = &req->rl_stream; |
474 | struct rpcrdma_mr_seg *seg; |
475 | struct rpcrdma_mr *mr; |
476 | int nsegs, nchunks; |
477 | __be32 *segcount; |
478 | |
479 | if (wtype != rpcrdma_replych) { |
480 | if (xdr_stream_encode_item_absent(xdr) < 0) |
481 | return -EMSGSIZE; |
482 | return 0; |
483 | } |
484 | |
485 | seg = req->rl_segments; |
486 | nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); |
487 | if (nsegs < 0) |
488 | return nsegs; |
489 | |
490 | if (xdr_stream_encode_item_present(xdr) < 0) |
491 | return -EMSGSIZE; |
492 | segcount = xdr_reserve_space(xdr, sizeof(*segcount)); |
493 | if (unlikely(!segcount)) |
494 | return -EMSGSIZE; |
495 | /* Actual value encoded below */ |
496 | |
497 | nchunks = 0; |
498 | do { |
499 | seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); |
500 | if (IS_ERR(seg)) |
501 | return PTR_ERR(seg); |
502 | |
503 | if (encode_rdma_segment(xdr, mr) < 0) |
504 | return -EMSGSIZE; |
505 | |
506 | trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs); |
507 | r_xprt->rx_stats.reply_chunk_count++; |
508 | r_xprt->rx_stats.total_rdma_request += mr->mr_length; |
509 | nchunks++; |
510 | nsegs -= mr->mr_nents; |
511 | } while (nsegs); |
512 | |
513 | /* Update count of segments in the Reply chunk */ |
514 | *segcount = cpu_to_be32(nchunks); |
515 | |
516 | return 0; |
517 | } |
518 | |
519 | static void rpcrdma_sendctx_done(struct kref *kref) |
520 | { |
521 | struct rpcrdma_req *req = |
522 | container_of(kref, struct rpcrdma_req, rl_kref); |
523 | struct rpcrdma_rep *rep = req->rl_reply; |
524 | |
525 | rpcrdma_complete_rqst(rep); |
526 | rep->rr_rxprt->rx_stats.reply_waits_for_send++; |
527 | } |
528 | |
529 | /** |
530 | * rpcrdma_sendctx_unmap - DMA-unmap Send buffer |
531 | * @sc: sendctx containing SGEs to unmap |
532 | * |
533 | */ |
534 | void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) |
535 | { |
536 | struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf; |
537 | struct ib_sge *sge; |
538 | |
539 | if (!sc->sc_unmap_count) |
540 | return; |
541 | |
542 | /* The first two SGEs contain the transport header and |
543 | * the inline buffer. These are always left mapped so |
544 | * they can be cheaply re-used. |
545 | */ |
546 | for (sge = &sc->sc_sges[2]; sc->sc_unmap_count; |
547 | ++sge, --sc->sc_unmap_count) |
548 | ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length, |
549 | DMA_TO_DEVICE); |
550 | |
551 | kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done); |
552 | } |
553 | |
554 | /* Prepare an SGE for the RPC-over-RDMA transport header. |
555 | */ |
556 | static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt, |
557 | struct rpcrdma_req *req, u32 len) |
558 | { |
559 | struct rpcrdma_sendctx *sc = req->rl_sendctx; |
560 | struct rpcrdma_regbuf *rb = req->rl_rdmabuf; |
561 | struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; |
562 | |
563 | sge->addr = rdmab_addr(rb); |
564 | sge->length = len; |
565 | sge->lkey = rdmab_lkey(rb); |
566 | |
567 | ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, |
568 | DMA_TO_DEVICE); |
569 | } |
570 | |
571 | /* The head iovec is straightforward, as it is usually already |
572 | * DMA-mapped. Sync the content that has changed. |
573 | */ |
574 | static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt, |
575 | struct rpcrdma_req *req, unsigned int len) |
576 | { |
577 | struct rpcrdma_sendctx *sc = req->rl_sendctx; |
578 | struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; |
579 | struct rpcrdma_regbuf *rb = req->rl_sendbuf; |
580 | |
581 | if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) |
582 | return false; |
583 | |
584 | sge->addr = rdmab_addr(rb); |
585 | sge->length = len; |
586 | sge->lkey = rdmab_lkey(rb); |
587 | |
588 | ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, |
589 | DMA_TO_DEVICE); |
590 | return true; |
591 | } |
592 | |
593 | /* If there is a page list present, DMA map and prepare an |
594 | * SGE for each page to be sent. |
595 | */ |
596 | static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req, |
597 | struct xdr_buf *xdr) |
598 | { |
599 | struct rpcrdma_sendctx *sc = req->rl_sendctx; |
600 | struct rpcrdma_regbuf *rb = req->rl_sendbuf; |
601 | unsigned int page_base, len, remaining; |
602 | struct page **ppages; |
603 | struct ib_sge *sge; |
604 | |
605 | ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); |
606 | page_base = offset_in_page(xdr->page_base); |
607 | remaining = xdr->page_len; |
608 | while (remaining) { |
609 | sge = &sc->sc_sges[req->rl_wr.num_sge++]; |
610 | len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); |
611 | sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages, |
612 | page_base, len, DMA_TO_DEVICE); |
613 | if (ib_dma_mapping_error(rdmab_device(rb), sge->addr)) |
614 | goto out_mapping_err; |
615 | |
616 | sge->length = len; |
617 | sge->lkey = rdmab_lkey(rb); |
618 | |
619 | sc->sc_unmap_count++; |
620 | ppages++; |
621 | remaining -= len; |
622 | page_base = 0; |
623 | } |
624 | |
625 | return true; |
626 | |
627 | out_mapping_err: |
628 | trace_xprtrdma_dma_maperr(sge->addr); |
629 | return false; |
630 | } |
631 | |
632 | /* The tail iovec may include an XDR pad for the page list, |
633 | * as well as additional content, and may not reside in the |
634 | * same page as the head iovec. |
635 | */ |
636 | static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req, |
637 | struct xdr_buf *xdr, |
638 | unsigned int page_base, unsigned int len) |
639 | { |
640 | struct rpcrdma_sendctx *sc = req->rl_sendctx; |
641 | struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++]; |
642 | struct rpcrdma_regbuf *rb = req->rl_sendbuf; |
643 | struct page *page = virt_to_page(xdr->tail[0].iov_base); |
644 | |
645 | sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len, |
646 | DMA_TO_DEVICE); |
647 | if (ib_dma_mapping_error(rdmab_device(rb), sge->addr)) |
648 | goto out_mapping_err; |
649 | |
650 | sge->length = len; |
651 | sge->lkey = rdmab_lkey(rb); |
652 | ++sc->sc_unmap_count; |
653 | return true; |
654 | |
655 | out_mapping_err: |
656 | trace_xprtrdma_dma_maperr(sge->addr); |
657 | return false; |
658 | } |
659 | |
660 | /* Copy the tail to the end of the head buffer. |
661 | */ |
662 | static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt, |
663 | struct rpcrdma_req *req, |
664 | struct xdr_buf *xdr) |
665 | { |
666 | unsigned char *dst; |
667 | |
668 | dst = (unsigned char *)xdr->head[0].iov_base; |
669 | dst += xdr->head[0].iov_len + xdr->page_len; |
670 | memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len); |
671 | r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len; |
672 | } |
673 | |
674 | /* Copy pagelist content into the head buffer. |
675 | */ |
676 | static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt, |
677 | struct rpcrdma_req *req, |
678 | struct xdr_buf *xdr) |
679 | { |
680 | unsigned int len, page_base, remaining; |
681 | struct page **ppages; |
682 | unsigned char *src, *dst; |
683 | |
684 | dst = (unsigned char *)xdr->head[0].iov_base; |
685 | dst += xdr->head[0].iov_len; |
686 | ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); |
687 | page_base = offset_in_page(xdr->page_base); |
688 | remaining = xdr->page_len; |
689 | while (remaining) { |
690 | src = page_address(*ppages); |
691 | src += page_base; |
692 | len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); |
693 | memcpy(dst, src, len); |
694 | r_xprt->rx_stats.pullup_copy_count += len; |
695 | |
696 | ppages++; |
697 | dst += len; |
698 | remaining -= len; |
699 | page_base = 0; |
700 | } |
701 | } |
702 | |
703 | /* Copy the contents of @xdr into @rl_sendbuf and DMA sync it. |
704 | * When the head, pagelist, and tail are small, a pull-up copy |
705 | * is considerably less costly than DMA mapping the components |
706 | * of @xdr. |
707 | * |
708 | * Assumptions: |
709 | * - the caller has already verified that the total length |
710 | * of the RPC Call body will fit into @rl_sendbuf. |
711 | */ |
712 | static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt, |
713 | struct rpcrdma_req *req, |
714 | struct xdr_buf *xdr) |
715 | { |
716 | if (unlikely(xdr->tail[0].iov_len)) |
717 | rpcrdma_pullup_tail_iov(r_xprt, req, xdr); |
718 | |
719 | if (unlikely(xdr->page_len)) |
720 | rpcrdma_pullup_pagelist(r_xprt, req, xdr); |
721 | |
722 | /* The whole RPC message resides in the head iovec now */ |
723 | return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len); |
724 | } |
725 | |
726 | static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, |
727 | struct rpcrdma_req *req, |
728 | struct xdr_buf *xdr) |
729 | { |
730 | struct kvec *tail = &xdr->tail[0]; |
731 | |
732 | if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) |
733 | return false; |
734 | if (xdr->page_len) |
735 | if (!rpcrdma_prepare_pagelist(req, xdr)) |
736 | return false; |
737 | if (tail->iov_len) |
738 | if (!rpcrdma_prepare_tail_iov(req, xdr, |
739 | offset_in_page(tail->iov_base), |
740 | tail->iov_len)) |
741 | return false; |
742 | |
743 | if (req->rl_sendctx->sc_unmap_count) |
744 | kref_get(&req->rl_kref); |
745 | return true; |
746 | } |
747 | |
748 | static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt, |
749 | struct rpcrdma_req *req, |
750 | struct xdr_buf *xdr) |
751 | { |
752 | if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len)) |
753 | return false; |
754 | |
755 | /* If there is a Read chunk, the page list is being handled |
756 | * via explicit RDMA, and thus is skipped here. |
757 | */ |
758 | |
759 | /* Do not include the tail if it is only an XDR pad */ |
760 | if (xdr->tail[0].iov_len > 3) { |
761 | unsigned int page_base, len; |
762 | |
763 | /* If the content in the page list is an odd length, |
764 | * xdr_write_pages() adds a pad at the beginning of |
765 | * the tail iovec. Force the tail's non-pad content to |
766 | * land at the next XDR position in the Send message. |
767 | */ |
768 | page_base = offset_in_page(xdr->tail[0].iov_base); |
769 | len = xdr->tail[0].iov_len; |
770 | page_base += len & 3; |
771 | len -= len & 3; |
772 | if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len)) |
773 | return false; |
774 | kref_get(&req->rl_kref); |
775 | } |
776 | |
777 | return true; |
778 | } |
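| |
| /* For illustration: if the argument page list carries 5 bytes of |
|  * payload, xdr_write_pages() places a 3-byte XDR pad at the start of |
|  * the tail iovec. With a 7-byte tail (3 pad bytes plus 4 bytes of |
|  * trailing content), len & 3 is 3, so the pad is skipped and only the |
|  * final 4 bytes are added as a Send SGE. |
|  */ |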
779 | |
780 | /** |
781 | * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR |
782 | * @r_xprt: controlling transport |
783 | * @req: context of RPC Call being marshalled |
784 | * @hdrlen: size of transport header, in bytes |
785 | * @xdr: xdr_buf containing RPC Call |
786 | * @rtype: chunk type being encoded |
787 | * |
788 | * Returns 0 on success; otherwise a negative errno is returned. |
789 | */ |
790 | int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, |
791 | struct rpcrdma_req *req, u32 hdrlen, |
792 | struct xdr_buf *xdr, |
793 | enum rpcrdma_chunktype rtype) |
794 | { |
795 | int ret; |
796 | |
797 | ret = -EAGAIN; |
798 | req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt); |
799 | if (!req->rl_sendctx) |
800 | goto out_nosc; |
801 | req->rl_sendctx->sc_unmap_count = 0; |
802 | req->rl_sendctx->sc_req = req; |
803 | kref_init(&req->rl_kref); |
804 | req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe; |
805 | req->rl_wr.sg_list = req->rl_sendctx->sc_sges; |
806 | req->rl_wr.num_sge = 0; |
807 | req->rl_wr.opcode = IB_WR_SEND; |
808 | |
809 | rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen); |
810 | |
811 | ret = -EIO; |
812 | switch (rtype) { |
813 | case rpcrdma_noch_pullup: |
814 | if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr)) |
815 | goto out_unmap; |
816 | break; |
817 | case rpcrdma_noch_mapped: |
818 | if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr)) |
819 | goto out_unmap; |
820 | break; |
821 | case rpcrdma_readch: |
822 | if (!rpcrdma_prepare_readch(r_xprt, req, xdr)) |
823 | goto out_unmap; |
824 | break; |
825 | case rpcrdma_areadch: |
826 | break; |
827 | default: |
828 | goto out_unmap; |
829 | } |
830 | |
831 | return 0; |
832 | |
833 | out_unmap: |
834 | rpcrdma_sendctx_unmap(req->rl_sendctx); |
835 | out_nosc: |
836 | trace_xprtrdma_prepsend_failed(&req->rl_slot, ret); |
837 | return ret; |
838 | } |
839 | |
840 | /** |
841 | * rpcrdma_marshal_req - Marshal and send one RPC request |
842 | * @r_xprt: controlling transport |
843 | * @rqst: RPC request to be marshaled |
844 | * |
845 | * For the RPC in "rqst", this function: |
846 | * - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG) |
847 | * - Registers Read, Write, and Reply chunks |
848 | * - Constructs the transport header |
849 | * - Posts a Send WR to send the transport header and request |
850 | * |
851 | * Returns: |
852 | * %0 if the RPC was sent successfully, |
853 | * %-ENOTCONN if the connection was lost, |
854 | * %-EAGAIN if the caller should call again with the same arguments, |
855 | * %-ENOBUFS if the caller should call again after a delay, |
856 | * %-EMSGSIZE if the transport header is too small, |
857 | * %-EIO if a permanent problem occurred while marshaling. |
858 | */ |
859 | int |
860 | rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) |
861 | { |
862 | struct rpcrdma_req *req = rpcr_to_rdmar(rqst); |
863 | struct xdr_stream *xdr = &req->rl_stream; |
864 | enum rpcrdma_chunktype rtype, wtype; |
865 | struct xdr_buf *buf = &rqst->rq_snd_buf; |
866 | bool ddp_allowed; |
867 | __be32 *p; |
868 | int ret; |
869 | |
870 | if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) { |
871 | ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf); |
872 | if (ret) |
873 | return ret; |
874 | } |
875 | |
876 | rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); |
877 | xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), |
878 | rqst); |
879 | |
880 | /* Fixed header fields */ |
881 | ret = -EMSGSIZE; |
882 | p = xdr_reserve_space(xdr, 4 * sizeof(*p)); |
883 | if (!p) |
884 | goto out_err; |
885 | *p++ = rqst->rq_xid; |
886 | *p++ = rpcrdma_version; |
887 | *p++ = r_xprt->rx_buf.rb_max_requests; |
888 | |
889 | /* When the ULP employs a GSS flavor that guarantees integrity |
890 | * or privacy, direct data placement of individual data items |
891 | * is not allowed. |
892 | */ |
893 | ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH, |
894 | &rqst->rq_cred->cr_auth->au_flags); |
895 | |
896 | /* |
897 | * Chunks needed for results? |
898 | * |
899 | * o If the expected result is under the inline threshold, all ops |
900 | * return as inline. |
901 | * o Large read ops return data as write chunk(s), header as |
902 | * inline. |
903 | * o Large non-read ops return as a single reply chunk. |
904 | */ |
905 | if (rpcrdma_results_inline(r_xprt, rqst)) |
906 | wtype = rpcrdma_noch; |
907 | else if ((ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) && |
908 | rpcrdma_nonpayload_inline(r_xprt, rqst)) |
909 | wtype = rpcrdma_writech; |
910 | else |
911 | wtype = rpcrdma_replych; |
912 | |
913 | /* |
914 | * Chunks needed for arguments? |
915 | * |
916 | * o If the total request is under the inline threshold, all ops |
917 | * are sent as inline. |
918 | * o Large write ops transmit data as read chunk(s), header as |
919 | * inline. |
920 | * o Large non-write ops are sent with the entire message as a |
921 | * single read chunk (protocol 0-position special case). |
922 | * |
923 | * This assumes that the upper layer does not present a request |
924 | * that both has a data payload, and whose non-data arguments |
925 | * by themselves are larger than the inline threshold. |
926 | */ |
927 | if (rpcrdma_args_inline(r_xprt, rqst)) { |
928 | *p++ = rdma_msg; |
929 | rtype = buf->len < rdmab_length(req->rl_sendbuf) ? |
930 | rpcrdma_noch_pullup : rpcrdma_noch_mapped; |
931 | } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) { |
932 | *p++ = rdma_msg; |
933 | rtype = rpcrdma_readch; |
934 | } else { |
935 | r_xprt->rx_stats.nomsg_call_count++; |
936 | *p++ = rdma_nomsg; |
937 | rtype = rpcrdma_areadch; |
938 | } |
939 | |
940 | /* This implementation supports the following combinations |
941 | * of chunk lists in one RPC-over-RDMA Call message: |
942 | * |
943 | * - Read list |
944 | * - Write list |
945 | * - Reply chunk |
946 | * - Read list + Reply chunk |
947 | * |
948 | * It might not yet support the following combinations: |
949 | * |
950 | * - Read list + Write list |
951 | * |
952 | * It does not support the following combinations: |
953 | * |
954 | * - Write list + Reply chunk |
955 | * - Read list + Write list + Reply chunk |
956 | * |
957 | * This implementation supports only a single chunk in each |
958 | * Read or Write list. Thus for example the client cannot |
959 | * send a Call message with a Position Zero Read chunk and a |
960 | * regular Read chunk at the same time. |
961 | */ |
962 | ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); |
963 | if (ret) |
964 | goto out_err; |
965 | ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); |
966 | if (ret) |
967 | goto out_err; |
968 | ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); |
969 | if (ret) |
970 | goto out_err; |
971 | |
972 | ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len, |
973 | buf, rtype); |
974 | if (ret) |
975 | goto out_err; |
976 | |
977 | trace_xprtrdma_marshal(req, rtype, wtype); |
978 | return 0; |
979 | |
980 | out_err: |
981 | trace_xprtrdma_marshal_failed(rqst, ret); |
982 | r_xprt->rx_stats.failed_marshal_count++; |
983 | frwr_reset(req); |
984 | return ret; |
985 | } |
986 | |
987 | static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt, |
988 | struct rpcrdma_buffer *buf, |
989 | u32 grant) |
990 | { |
991 | buf->rb_credits = grant; |
992 | xprt->cwnd = grant << RPC_CWNDSHIFT; |
993 | } |
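| |
| /* For example, assuming RPC_CWNDSHIFT is 8 (see linux/sunrpc/xprt.h), |
|  * a credit grant of 128 yields a cwnd of 128 * RPC_CWNDSCALE, which |
|  * lets the RPC layer keep up to 128 requests in flight on this |
|  * transport. |
|  */ |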
994 | |
995 | static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant) |
996 | { |
997 | struct rpc_xprt *xprt = &r_xprt->rx_xprt; |
998 | |
999 | spin_lock(&xprt->transport_lock); |
1000 | __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant); |
1001 | spin_unlock(&xprt->transport_lock); |
1002 | } |
1003 | |
1004 | /** |
1005 | * rpcrdma_reset_cwnd - Reset the xprt's congestion window |
1006 | * @r_xprt: controlling transport instance |
1007 | * |
1008 | * Prepare @r_xprt for the next connection by reinitializing |
1009 | * its credit grant to one (see RFC 8166, Section 3.3.3). |
1010 | */ |
1011 | void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt) |
1012 | { |
1013 | struct rpc_xprt *xprt = &r_xprt->rx_xprt; |
1014 | |
1015 | spin_lock(&xprt->transport_lock); |
1016 | xprt->cong = 0; |
1017 | __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1); |
1018 | spin_unlock(&xprt->transport_lock); |
1019 | } |
1020 | |
1021 | /** |
1022 | * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs |
1023 | * @rqst: controlling RPC request |
1024 | * @srcp: points to RPC message payload in receive buffer |
1025 | * @copy_len: remaining length of receive buffer content |
1026 | * @pad: Write chunk pad bytes needed (zero for pure inline) |
1027 | * |
1028 | * The upper layer has set the maximum number of bytes it can |
1029 | * receive in each component of rq_rcv_buf. These values are set in |
1030 | * the head.iov_len, page_len, tail.iov_len, and buflen fields. |
1031 | * |
1032 | * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in |
1033 | * many cases this function simply updates iov_base pointers in |
1034 | * rq_rcv_buf to point directly to the received reply data, to |
1035 | * avoid copying reply data. |
1036 | * |
1037 | * Returns the count of bytes which had to be memcopied. |
1038 | */ |
1039 | static unsigned long |
1040 | rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) |
1041 | { |
1042 | unsigned long fixup_copy_count; |
1043 | int i, npages, curlen; |
1044 | char *destp; |
1045 | struct page **ppages; |
1046 | int page_base; |
1047 | |
1048 | /* The head iovec is redirected to the RPC reply message |
1049 | * in the receive buffer, to avoid a memcopy. |
1050 | */ |
1051 | rqst->rq_rcv_buf.head[0].iov_base = srcp; |
1052 | rqst->rq_private_buf.head[0].iov_base = srcp; |
1053 | |
1054 | /* The contents of the receive buffer that follow |
1055 | * head.iov_len bytes are copied into the page list. |
1056 | */ |
1057 | curlen = rqst->rq_rcv_buf.head[0].iov_len; |
1058 | if (curlen > copy_len) |
1059 | curlen = copy_len; |
1060 | srcp += curlen; |
1061 | copy_len -= curlen; |
1062 | |
1063 | ppages = rqst->rq_rcv_buf.pages + |
1064 | (rqst->rq_rcv_buf.page_base >> PAGE_SHIFT); |
1065 | page_base = offset_in_page(rqst->rq_rcv_buf.page_base); |
1066 | fixup_copy_count = 0; |
1067 | if (copy_len && rqst->rq_rcv_buf.page_len) { |
1068 | int pagelist_len; |
1069 | |
1070 | pagelist_len = rqst->rq_rcv_buf.page_len; |
1071 | if (pagelist_len > copy_len) |
1072 | pagelist_len = copy_len; |
1073 | npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT; |
1074 | for (i = 0; i < npages; i++) { |
1075 | curlen = PAGE_SIZE - page_base; |
1076 | if (curlen > pagelist_len) |
1077 | curlen = pagelist_len; |
1078 | |
1079 | destp = kmap_atomic(ppages[i]); |
1080 | memcpy(destp + page_base, srcp, curlen); |
1081 | flush_dcache_page(ppages[i]); |
1082 | kunmap_atomic(destp); |
1083 | srcp += curlen; |
1084 | copy_len -= curlen; |
1085 | fixup_copy_count += curlen; |
1086 | pagelist_len -= curlen; |
1087 | if (!pagelist_len) |
1088 | break; |
1089 | page_base = 0; |
1090 | } |
1091 | |
1092 | /* Implicit padding for the last segment in a Write |
1093 | * chunk is inserted inline at the front of the tail |
1094 | * iovec. The upper layer ignores the content of |
1095 | * the pad. Simply ensure inline content in the tail |
1096 | * that follows the Write chunk is properly aligned. |
1097 | */ |
1098 | if (pad) |
1099 | srcp -= pad; |
1100 | } |
1101 | |
1102 | /* The tail iovec is redirected to the remaining data |
1103 | * in the receive buffer, to avoid a memcopy. |
1104 | */ |
1105 | if (copy_len || pad) { |
1106 | rqst->rq_rcv_buf.tail[0].iov_base = srcp; |
1107 | rqst->rq_private_buf.tail[0].iov_base = srcp; |
1108 | } |
1109 | |
1110 | if (fixup_copy_count) |
1111 | trace_xprtrdma_fixup(rqst, fixup_copy_count); |
1112 | return fixup_copy_count; |
1113 | } |
1114 | |
1115 | /* By convention, backchannel calls arrive via rdma_msg type |
1116 | * messages, and never populate the chunk lists. This makes |
1117 | * the RPC/RDMA header small and fixed in size, so it is |
1118 | * straightforward to check the RPC header's direction field. |
1119 | */ |
1120 | static bool |
1121 | rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) |
1122 | #if defined(CONFIG_SUNRPC_BACKCHANNEL) |
1123 | { |
1124 | struct rpc_xprt *xprt = &r_xprt->rx_xprt; |
1125 | struct xdr_stream *xdr = &rep->rr_stream; |
1126 | __be32 *p; |
1127 | |
1128 | if (rep->rr_proc != rdma_msg) |
1129 | return false; |
1130 | |
1131 | /* Peek at stream contents without advancing. */ |
1132 | p = xdr_inline_decode(xdr, 0); |
1133 | |
1134 | /* Chunk lists */ |
1135 | if (xdr_item_is_present(p++)) |
1136 | return false; |
1137 | if (xdr_item_is_present(p++)) |
1138 | return false; |
1139 | if (xdr_item_is_present(p++)) |
1140 | return false; |
1141 | |
1142 | /* RPC header */ |
1143 | if (*p++ != rep->rr_xid) |
1144 | return false; |
1145 | if (*p != cpu_to_be32(RPC_CALL)) |
1146 | return false; |
1147 | |
1148 | /* No bc service. */ |
1149 | if (xprt->bc_serv == NULL) |
1150 | return false; |
1151 | |
1152 | /* Now that we are sure this is a backchannel call, |
1153 | * advance to the RPC header. |
1154 | */ |
1155 | p = xdr_inline_decode(xdr, 3 * sizeof(*p)); |
1156 | if (unlikely(!p)) |
1157 | return true; |
1158 | |
1159 | rpcrdma_bc_receive_call(r_xprt, rep); |
1160 | return true; |
1161 | } |
1162 | #else /* CONFIG_SUNRPC_BACKCHANNEL */ |
1163 | { |
1164 | return false; |
1165 | } |
1166 | #endif /* CONFIG_SUNRPC_BACKCHANNEL */ |
1167 | |
1168 | static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) |
1169 | { |
1170 | u32 handle; |
1171 | u64 offset; |
1172 | __be32 *p; |
1173 | |
1174 | p = xdr_inline_decode(xdr, 4 * sizeof(*p)); |
1175 | if (unlikely(!p)) |
1176 | return -EIO; |
1177 | |
1178 | xdr_decode_rdma_segment(p, &handle, length, &offset); |
1179 | trace_xprtrdma_decode_seg(handle, *length, offset); |
1180 | return 0; |
1181 | } |
1182 | |
1183 | static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) |
1184 | { |
1185 | u32 segcount, seglength; |
1186 | __be32 *p; |
1187 | |
1188 | p = xdr_inline_decode(xdr, sizeof(*p)); |
1189 | if (unlikely(!p)) |
1190 | return -EIO; |
1191 | |
1192 | *length = 0; |
1193 | segcount = be32_to_cpup(p); |
1194 | while (segcount--) { |
1195 | if (decode_rdma_segment(xdr, &seglength)) |
1196 | return -EIO; |
1197 | *length += seglength; |
1198 | } |
1199 | |
1200 | return 0; |
1201 | } |
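| |
| /* For example, a Write chunk decoded above with a segment count of 2, |
|  * whose segments report lengths of 4096 and 1024 bytes, yields a total |
|  * *length of 5120 bytes written by the responder. |
|  */ |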
1202 | |
1203 | /* In RPC-over-RDMA Version One replies, a Read list is never |
1204 | * expected. This decoder is a stub that returns an error if |
1205 | * a Read list is present. |
1206 | */ |
1207 | static int decode_read_list(struct xdr_stream *xdr) |
1208 | { |
1209 | __be32 *p; |
1210 | |
1211 | p = xdr_inline_decode(xdr, sizeof(*p)); |
1212 | if (unlikely(!p)) |
1213 | return -EIO; |
1214 | if (unlikely(xdr_item_is_present(p))) |
1215 | return -EIO; |
1216 | return 0; |
1217 | } |
1218 | |
1219 | /* Supports only one Write chunk in the Write list |
1220 | */ |
1221 | static int decode_write_list(struct xdr_stream *xdr, u32 *length) |
1222 | { |
1223 | u32 chunklen; |
1224 | bool first; |
1225 | __be32 *p; |
1226 | |
1227 | *length = 0; |
1228 | first = true; |
1229 | do { |
1230 | p = xdr_inline_decode(xdr, sizeof(*p)); |
1231 | if (unlikely(!p)) |
1232 | return -EIO; |
1233 | if (xdr_item_is_absent(p)) |
1234 | break; |
1235 | if (!first) |
1236 | return -EIO; |
1237 | |
1238 | if (decode_write_chunk(xdr, &chunklen)) |
1239 | return -EIO; |
1240 | *length += chunklen; |
1241 | first = false; |
1242 | } while (true); |
1243 | return 0; |
1244 | } |
1245 | |
1246 | static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) |
1247 | { |
1248 | __be32 *p; |
1249 | |
1250 | p = xdr_inline_decode(xdr, sizeof(*p)); |
1251 | if (unlikely(!p)) |
1252 | return -EIO; |
1253 | |
1254 | *length = 0; |
1255 | if (xdr_item_is_present(p)) |
1256 | if (decode_write_chunk(xdr, length)) |
1257 | return -EIO; |
1258 | return 0; |
1259 | } |
1260 | |
1261 | static int |
1262 | rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, |
1263 | struct rpc_rqst *rqst) |
1264 | { |
1265 | struct xdr_stream *xdr = &rep->rr_stream; |
1266 | u32 writelist, replychunk, rpclen; |
1267 | char *base; |
1268 | |
1269 | /* Decode the chunk lists */ |
1270 | if (decode_read_list(xdr)) |
1271 | return -EIO; |
1272 | if (decode_write_list(xdr, &writelist)) |
1273 | return -EIO; |
1274 | if (decode_reply_chunk(xdr, &replychunk)) |
1275 | return -EIO; |
1276 | |
1277 | /* RDMA_MSG sanity checks */ |
1278 | if (unlikely(replychunk)) |
1279 | return -EIO; |
1280 | |
1281 | /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */ |
1282 | base = (char *)xdr_inline_decode(xdr, 0); |
1283 | rpclen = xdr_stream_remaining(xdr); |
1284 | r_xprt->rx_stats.fixup_copy_count += |
1285 | rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3); |
1286 | |
1287 | r_xprt->rx_stats.total_rdma_reply += writelist; |
1288 | return rpclen + xdr_align_size(writelist); |
1289 | } |
1290 | |
1291 | static noinline int |
1292 | rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) |
1293 | { |
1294 | struct xdr_stream *xdr = &rep->rr_stream; |
1295 | u32 writelist, replychunk; |
1296 | |
1297 | /* Decode the chunk lists */ |
1298 | if (decode_read_list(xdr)) |
1299 | return -EIO; |
1300 | if (decode_write_list(xdr, &writelist)) |
1301 | return -EIO; |
1302 | if (decode_reply_chunk(xdr, &replychunk)) |
1303 | return -EIO; |
1304 | |
1305 | /* RDMA_NOMSG sanity checks */ |
1306 | if (unlikely(writelist)) |
1307 | return -EIO; |
1308 | if (unlikely(!replychunk)) |
1309 | return -EIO; |
1310 | |
1311 | /* Reply chunk buffer already is the reply vector */ |
1312 | r_xprt->rx_stats.total_rdma_reply += replychunk; |
1313 | return replychunk; |
1314 | } |
1315 | |
1316 | static noinline int |
1317 | rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, |
1318 | struct rpc_rqst *rqst) |
1319 | { |
1320 | struct xdr_stream *xdr = &rep->rr_stream; |
1321 | __be32 *p; |
1322 | |
1323 | p = xdr_inline_decode(xdr, sizeof(*p)); |
1324 | if (unlikely(!p)) |
1325 | return -EIO; |
1326 | |
1327 | switch (*p) { |
1328 | case err_vers: |
1329 | p = xdr_inline_decode(xdr, 2 * sizeof(*p)); |
1330 | if (!p) |
1331 | break; |
1332 | trace_xprtrdma_err_vers(rqst, p, p + 1); |
1333 | break; |
1334 | case err_chunk: |
1335 | trace_xprtrdma_err_chunk(rqst); |
1336 | break; |
1337 | default: |
1338 | trace_xprtrdma_err_unrecognized(rqst, p); |
1339 | } |
1340 | |
1341 | return -EIO; |
1342 | } |
1343 | |
1344 | /** |
1345 | * rpcrdma_unpin_rqst - Release rqst without completing it |
1346 | * @rep: RPC/RDMA Receive context |
1347 | * |
1348 | * This is done when a connection is lost so that a Reply |
1349 | * can be dropped and its matching Call can be subsequently |
1350 | * retransmitted on a new connection. |
1351 | */ |
1352 | void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep) |
1353 | { |
1354 | struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt; |
1355 | struct rpc_rqst *rqst = rep->rr_rqst; |
1356 | struct rpcrdma_req *req = rpcr_to_rdmar(rqst); |
1357 | |
1358 | req->rl_reply = NULL; |
1359 | rep->rr_rqst = NULL; |
1360 | |
1361 | spin_lock(&xprt->queue_lock); |
1362 | xprt_unpin_rqst(rqst); |
1363 | spin_unlock(&xprt->queue_lock); |
1364 | } |
1365 | |
1366 | /** |
1367 | * rpcrdma_complete_rqst - Pass completed rqst back to RPC |
1368 | * @rep: RPC/RDMA Receive context |
1369 | * |
1370 | * Reconstruct the RPC reply and complete the transaction |
1371 | * while @rqst is still pinned to ensure the rep, rqst, and |
1372 | * rq_task pointers remain stable. |
1373 | */ |
1374 | void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) |
1375 | { |
1376 | struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; |
1377 | struct rpc_xprt *xprt = &r_xprt->rx_xprt; |
1378 | struct rpc_rqst *rqst = rep->rr_rqst; |
1379 | int status; |
1380 | |
1381 | switch (rep->rr_proc) { |
1382 | case rdma_msg: |
1383 | status = rpcrdma_decode_msg(r_xprt, rep, rqst); |
1384 | break; |
1385 | case rdma_nomsg: |
1386 | status = rpcrdma_decode_nomsg(r_xprt, rep); |
1387 | break; |
1388 | case rdma_error: |
1389 | status = rpcrdma_decode_error(r_xprt, rep, rqst); |
1390 | break; |
1391 | default: |
1392 | status = -EIO; |
1393 | } |
1394 | if (status < 0) |
1395 | goto out_badheader; |
1396 | |
1397 | out: |
1398 | spin_lock(&xprt->queue_lock); |
1399 | xprt_complete_rqst(rqst->rq_task, status); |
1400 | xprt_unpin_rqst(rqst); |
1401 | spin_unlock(&xprt->queue_lock); |
1402 | return; |
1403 | |
1404 | out_badheader: |
1405 | trace_xprtrdma_reply_hdr_err(rep); |
1406 | r_xprt->rx_stats.bad_reply_count++; |
1407 | rqst->rq_task->tk_status = status; |
1408 | status = 0; |
1409 | goto out; |
1410 | } |
1411 | |
1412 | static void rpcrdma_reply_done(struct kref *kref) |
1413 | { |
1414 | struct rpcrdma_req *req = |
1415 | container_of(kref, struct rpcrdma_req, rl_kref); |
1416 | |
1417 | rpcrdma_complete_rqst(req->rl_reply); |
1418 | } |
1419 | |
1420 | /** |
1421 | * rpcrdma_reply_handler - Process received RPC/RDMA messages |
1422 | * @rep: Incoming rpcrdma_rep object to process |
1423 | * |
1424 | * Errors must result in the RPC task either being awakened, or |
1425 | * allowed to timeout, to discover the errors at that time. |
1426 | */ |
1427 | void rpcrdma_reply_handler(struct rpcrdma_rep *rep) |
1428 | { |
1429 | struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; |
1430 | struct rpc_xprt *xprt = &r_xprt->rx_xprt; |
1431 | struct rpcrdma_buffer *buf = &r_xprt->rx_buf; |
1432 | struct rpcrdma_req *req; |
1433 | struct rpc_rqst *rqst; |
1434 | u32 credits; |
1435 | __be32 *p; |
1436 | |
1437 | /* Any data means we had a useful conversation, so |
1438 | * then we don't need to delay the next reconnect. |
1439 | */ |
1440 | if (xprt->reestablish_timeout) |
1441 | xprt->reestablish_timeout = 0; |
1442 | |
1443 | /* Fixed transport header fields */ |
1444 | xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, |
1445 | rep->rr_hdrbuf.head[0].iov_base, NULL); |
1446 | p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p)); |
1447 | if (unlikely(!p)) |
1448 | goto out_shortreply; |
1449 | rep->rr_xid = *p++; |
1450 | rep->rr_vers = *p++; |
1451 | credits = be32_to_cpu(*p++); |
1452 | rep->rr_proc = *p++; |
1453 | |
1454 | if (rep->rr_vers != rpcrdma_version) |
1455 | goto out_badversion; |
1456 | |
1457 | if (rpcrdma_is_bcall(r_xprt, rep)) |
1458 | return; |
1459 | |
1460 | /* Match incoming rpcrdma_rep to an rpcrdma_req to |
1461 | * get context for handling any incoming chunks. |
1462 | */ |
1463 | spin_lock(&xprt->queue_lock); |
1464 | rqst = xprt_lookup_rqst(xprt, rep->rr_xid); |
1465 | if (!rqst) |
1466 | goto out_norqst; |
1467 | xprt_pin_rqst(rqst); |
1468 | spin_unlock(&xprt->queue_lock); |
1469 | |
1470 | if (credits == 0) |
1471 | credits = 1; /* don't deadlock */ |
1472 | else if (credits > r_xprt->rx_ep->re_max_requests) |
1473 | credits = r_xprt->rx_ep->re_max_requests; |
1474 | rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1), |
1475 | false); |
1476 | if (buf->rb_credits != credits) |
1477 | rpcrdma_update_cwnd(r_xprt, credits); |
1478 | |
1479 | req = rpcr_to_rdmar(rqst); |
1480 | if (unlikely(req->rl_reply)) |
1481 | rpcrdma_rep_put(buf, req->rl_reply); |
1482 | req->rl_reply = rep; |
1483 | rep->rr_rqst = rqst; |
1484 | |
1485 | trace_xprtrdma_reply(rqst->rq_task, rep, credits); |
1486 | |
1487 | if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) |
1488 | frwr_reminv(rep, &req->rl_registered); |
1489 | if (!list_empty(&req->rl_registered)) |
1490 | frwr_unmap_async(r_xprt, req); |
1491 | /* LocalInv completion will complete the RPC */ |
1492 | else |
1493 | kref_put(&req->rl_kref, rpcrdma_reply_done); |
1494 | return; |
1495 | |
1496 | out_badversion: |
1497 | trace_xprtrdma_reply_vers_err(rep); |
1498 | goto out; |
1499 | |
1500 | out_norqst: |
1501 | spin_unlock(&xprt->queue_lock); |
1502 | trace_xprtrdma_reply_rqst_err(rep); |
1503 | goto out; |
1504 | |
1505 | out_shortreply: |
1506 | trace_xprtrdma_reply_short_err(rep); |
1507 | |
1508 | out: |
1509 | rpcrdma_rep_put(buf, rep); |
1510 | } |
1511 | |