/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * rpc_rdma.c
 *
 * This file contains the guts of the RPC RDMA protocol, and
 * does marshaling/unmarshaling, etc. It is also where the
 * interface to the Linux RPC framework lives.
 */

#include "xprt_rdma.h"

#include <linux/highmem.h>

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

enum rpcrdma_chunktype {
	rpcrdma_noch = 0,
	rpcrdma_readch,
	rpcrdma_areadch,
	rpcrdma_writech,
	rpcrdma_replych
};

static const char transfertypes[][12] = {
	"inline",	/* no chunks */
	"read list",	/* some argument via rdma read */
	"*read list",	/* entire request via rdma read */
	"write list",	/* some result via rdma write */
	"reply chunk"	/* entire reply via rdma write */
};

/* Returns size of largest RPC-over-RDMA header in a Call message
 *
 * The largest Call header contains a full-size Read list and a
 * minimal Reply chunk.
 */
static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
{
	unsigned int size;

	/* Fixed header fields and list discriminators */
	size = RPCRDMA_HDRLEN_MIN;

	/* Maximum Read list size */
	maxsegs += 2;	/* segments for head and tail buffers */
	size += maxsegs * sizeof(struct rpcrdma_read_chunk);

	/* Minimal Reply chunk size */
	size += sizeof(__be32);	/* segment count */
	size += sizeof(struct rpcrdma_segment);
	size += sizeof(__be32);	/* list discriminator */

	dprintk("RPC: %s: max call header size = %u\n",
		__func__, size);
	return size;
}

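/* Worked example (a sketch with assumed sizes, not normative): with
 * RPCRDMA_HDRLEN_MIN = 28 bytes (four fixed header words plus three
 * empty chunk-list discriminators), a 16-byte struct rpcrdma_segment,
 * and a 24-byte struct rpcrdma_read_chunk, a device limit of 8
 * segments gives
 *
 *	28 + (8 + 2) * 24 + (4 + 16 + 4) = 292 bytes
 *
 * reserved for the largest possible Call header.
 */
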
/* Returns size of largest RPC-over-RDMA header in a Reply message
 *
 * There is only one Write list or one Reply chunk per Reply
 * message. The larger list is the Write list.
 */
static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
{
	unsigned int size;

	/* Fixed header fields and list discriminators */
	size = RPCRDMA_HDRLEN_MIN;

	/* Maximum Write list size */
	maxsegs += 2;	/* segments for head and tail buffers */
	size += sizeof(__be32);	/* segment count */
	size += maxsegs * sizeof(struct rpcrdma_segment);
	size += sizeof(__be32);	/* list discriminator */

	dprintk("RPC: %s: max reply header size = %u\n",
		__func__, size);
	return size;
}

void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
				  struct rpcrdma_create_data_internal *cdata,
				  unsigned int maxsegs)
{
	ia->ri_max_inline_write = cdata->inline_wsize -
				  rpcrdma_max_call_header_size(maxsegs);
	ia->ri_max_inline_read = cdata->inline_rsize -
				  rpcrdma_max_reply_header_size(maxsegs);
}

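/* Illustration (assumed numbers): if the negotiated inline_wsize is
 * 1024 bytes and the worst-case Call header is the 292 bytes computed
 * in the example above, ri_max_inline_write becomes 1024 - 292 = 732
 * bytes. Any Call whose RPC message is larger than that must move its
 * payload into chunks.
 */
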
/* The client can send a request inline as long as the RPCRDMA header
 * plus the RPC call fit under the transport's inline limit. If the
 * combined call message size exceeds that limit, the client must use
 * the read chunk list for this operation.
 */
static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
				struct rpc_rqst *rqst)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;

	return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
}

/* The client can't know how large the actual reply will be. Thus it
 * plans for the largest possible reply for that particular ULP
 * operation. If the maximum combined reply message size exceeds that
 * limit, the client must provide a write list or a reply chunk for
 * this request.
 */
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
				   struct rpc_rqst *rqst)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;

	return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
}

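/* For example (an assumed ULP scenario): an NFS READ of 128 KB sets
 * rq_rcv_buf.buflen well above any plausible ri_max_inline_read, so
 * rpcrdma_results_inline() returns false and the marshaling code
 * below provisions a Write list for the payload.
 */
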
static int
rpcrdma_tail_pullup(struct xdr_buf *buf)
{
	size_t tlen = buf->tail[0].iov_len;
	size_t skip = tlen & 3;

	/* Do not include the tail if it is only an XDR pad */
	if (tlen < 4)
		return 0;

	/* xdr_write_pages() adds a pad at the beginning of the tail
	 * if the content in "buf->pages" is unaligned. Force the
	 * tail's actual content to land at the next XDR position
	 * after the head instead.
	 */
	if (skip) {
		unsigned char *src, *dst;
		unsigned int count;

		src = buf->tail[0].iov_base;
		dst = buf->head[0].iov_base;
		dst += buf->head[0].iov_len;

		src += skip;
		tlen -= skip;

		dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n",
			__func__, skip, dst, src, tlen);

		for (count = tlen; count; count--)
			*dst++ = *src++;
	}

	return tlen;
}

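/* Worked example (assumed sizes): if page_len is 1001 bytes,
 * xdr_write_pages() puts 3 pad bytes at the start of the tail to keep
 * the tail content XDR-aligned. Then tlen includes that pad,
 * skip = tlen & 3 = 3, and the tail content minus the pad is copied
 * to follow the head so it can be transmitted inline right after the
 * RPC message.
 */
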
/* Split "vec" on page boundaries into segments. FMR registers pages,
 * not a byte range. Other modes coalesce these segments into a single
 * MR when they can.
 */
static int
rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
{
	size_t page_offset;
	u32 remaining;
	char *base;

	base = vec->iov_base;
	page_offset = offset_in_page(base);
	remaining = vec->iov_len;
	while (remaining && n < RPCRDMA_MAX_SEGS) {
		seg[n].mr_page = NULL;
		seg[n].mr_offset = base;
		seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
		remaining -= seg[n].mr_len;
		base += seg[n].mr_len;
		++n;
		page_offset = 0;
	}
	return n;
}

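/* Worked example (assuming PAGE_SIZE = 4096): a 6000-byte kvec whose
 * iov_base sits at page offset 3000 splits into three segments of
 * 1096, 4096, and 808 bytes; the first ends at a page boundary, and
 * only the last is shorter than a full page.
 */
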
/*
 * Chunk assembly from upper layer xdr_buf.
 *
 * Convert the passed-in xdr_buf into an array of rpcrdma_mr_seg
 * elements. Segments are then coalesced when registered, if possible
 * within the selected memreg mode.
 *
 * Returns positive number of segments converted, or a negative errno.
 */
static int
rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
		     enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
{
	int len, n, p, page_base;
	struct page **ppages;

	n = 0;
	if (pos == 0) {
		n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n);
		if (n == RPCRDMA_MAX_SEGS)
			goto out_overflow;
	}

	len = xdrbuf->page_len;
	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
	page_base = xdrbuf->page_base & ~PAGE_MASK;
	p = 0;
	while (len && n < RPCRDMA_MAX_SEGS) {
		if (!ppages[p]) {
			/* Allocate pages for the receive buffer on demand */
			ppages[p] = alloc_page(GFP_ATOMIC);
			if (!ppages[p])
				return -EAGAIN;
		}
		seg[n].mr_page = ppages[p];
		seg[n].mr_offset = (void *)(unsigned long) page_base;
		seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
		if (seg[n].mr_len > PAGE_SIZE)
			goto out_overflow;
		len -= seg[n].mr_len;
		++n;
		++p;
		page_base = 0;	/* page offset only applies to first page */
	}

	/* Message overflows the seg array */
	if (len && n == RPCRDMA_MAX_SEGS)
		goto out_overflow;

	/* When encoding the read list, the tail is always sent inline */
	if (type == rpcrdma_readch)
		return n;

	if (xdrbuf->tail[0].iov_len) {
		/* The rpcrdma protocol allows us to omit any trailing
		 * xdr pad bytes, saving the server an RDMA operation. */
		if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
			return n;
		n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
		if (n == RPCRDMA_MAX_SEGS)
			goto out_overflow;
	}

	return n;

out_overflow:
	pr_err("rpcrdma: segment array overflow\n");
	return -EIO;
}

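/* End-to-end example (assumed sizes, PAGE_SIZE = 4096): an xdr_buf
 * with a page-aligned 100-byte head, an 8189-byte page list, and a
 * 3-byte tail pad converts at pos == 0 into three segments: head
 * (100 bytes), page 0 (4096 bytes), and page 1 (4093 bytes). With
 * xprt_rdma_pad_optimize set, the tail is dropped entirely because
 * it is only an XDR pad.
 */
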
static inline __be32 *
xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
{
	*iptr++ = cpu_to_be32(mw->mw_handle);
	*iptr++ = cpu_to_be32(mw->mw_length);
	return xdr_encode_hyper(iptr, mw->mw_offset);
}

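/* On the wire this emits the 16-byte XDR "plain segment" (the HLOO of
 * the encoding keys below): a 32-bit handle, a 32-bit length, and a
 * 64-bit offset, all big-endian. Illustrative values only: a segment
 * with handle 0x42, length 4096, at remote offset 0x1000 encodes as
 * the words 0x00000042 0x00001000 0x00000000 0x00001000.
 */
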
/* XDR-encode the Read list. Supports encoding a list of read
 * segments that belong to a single read chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Read chunklist (a linked list):
 *   N elements, position P (same P for all chunks of same arg!):
 *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
 *
 * Returns a pointer to the XDR word in the RDMA header following
 * the end of the Read list, or an error pointer.
 */
static __be32 *
rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
			 struct rpcrdma_req *req, struct rpc_rqst *rqst,
			 __be32 *iptr, enum rpcrdma_chunktype rtype)
{
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mw *mw;
	unsigned int pos;
	int n, nsegs;

	if (rtype == rpcrdma_noch) {
		*iptr++ = xdr_zero;	/* item not present */
		return iptr;
	}

	pos = rqst->rq_snd_buf.head[0].iov_len;
	if (rtype == rpcrdma_areadch)
		pos = 0;
	seg = req->rl_segments;
	nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
	if (nsegs < 0)
		return ERR_PTR(nsegs);

	do {
		n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
						 false, &mw);
		if (n < 0)
			return ERR_PTR(n);
		list_add(&mw->mw_list, &req->rl_registered);

		*iptr++ = xdr_one;	/* item present */

		/* All read segments in this chunk
		 * have the same "position".
		 */
		*iptr++ = cpu_to_be32(pos);
		iptr = xdr_encode_rdma_segment(iptr, mw);

		dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
			rqst->rq_task->tk_pid, __func__, pos,
			mw->mw_length, (unsigned long long)mw->mw_offset,
			mw->mw_handle, n < nsegs ? "more" : "last");

		r_xprt->rx_stats.read_chunk_count++;
		seg += n;
		nsegs -= n;
	} while (nsegs);

	/* Finish Read list */
	*iptr++ = xdr_zero;	/* Next item not present */
	return iptr;
}

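/* Wire sketch (illustrative values): a Position Zero Read chunk that
 * was registered as two MRs is emitted as
 *
 *	1, P=0, H1/L1/O1,  1, P=0, H2/L2/O2,  0
 *
 * where both list items carry position 0 because they describe one
 * logical chunk of the same RPC argument.
 */
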
/* XDR-encode the Write list. Supports encoding a list containing
 * one array of plain segments that belong to a single write chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Write chunklist (a list of (one) counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO - 0
 *
 * Returns a pointer to the XDR word in the RDMA header following
 * the end of the Write list, or an error pointer.
 */
static __be32 *
rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
			  struct rpc_rqst *rqst, __be32 *iptr,
			  enum rpcrdma_chunktype wtype)
{
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mw *mw;
	int n, nsegs, nchunks;
	__be32 *segcount;

	if (wtype != rpcrdma_writech) {
		*iptr++ = xdr_zero;	/* no Write list present */
		return iptr;
	}

	seg = req->rl_segments;
	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
				     rqst->rq_rcv_buf.head[0].iov_len,
				     wtype, seg);
	if (nsegs < 0)
		return ERR_PTR(nsegs);

	*iptr++ = xdr_one;	/* Write list present */
	segcount = iptr++;	/* save location of segment count */

	nchunks = 0;
	do {
		n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
						 true, &mw);
		if (n < 0)
			return ERR_PTR(n);
		list_add(&mw->mw_list, &req->rl_registered);

		iptr = xdr_encode_rdma_segment(iptr, mw);

		dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
			rqst->rq_task->tk_pid, __func__,
			mw->mw_length, (unsigned long long)mw->mw_offset,
			mw->mw_handle, n < nsegs ? "more" : "last");

		r_xprt->rx_stats.write_chunk_count++;
		r_xprt->rx_stats.total_rdma_request += seg->mr_len;
		nchunks++;
		seg += n;
		nsegs -= n;
	} while (nsegs);

	/* Update count of segments in this Write chunk */
	*segcount = cpu_to_be32(nchunks);

	/* Finish Write list */
	*iptr++ = xdr_zero;	/* Next item not present */
	return iptr;
}

/* XDR-encode the Reply chunk. Supports encoding an array of plain
 * segments that belong to a single write (reply) chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Reply chunk (a counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO
 *
 * Returns a pointer to the XDR word in the RDMA header following
 * the end of the Reply chunk, or an error pointer.
 */
static __be32 *
rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
			   struct rpcrdma_req *req, struct rpc_rqst *rqst,
			   __be32 *iptr, enum rpcrdma_chunktype wtype)
{
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mw *mw;
	int n, nsegs, nchunks;
	__be32 *segcount;

	if (wtype != rpcrdma_replych) {
		*iptr++ = xdr_zero;	/* no Reply chunk present */
		return iptr;
	}

	seg = req->rl_segments;
	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
	if (nsegs < 0)
		return ERR_PTR(nsegs);

	*iptr++ = xdr_one;	/* Reply chunk present */
	segcount = iptr++;	/* save location of segment count */

	nchunks = 0;
	do {
		n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
						 true, &mw);
		if (n < 0)
			return ERR_PTR(n);
		list_add(&mw->mw_list, &req->rl_registered);

		iptr = xdr_encode_rdma_segment(iptr, mw);

		dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
			rqst->rq_task->tk_pid, __func__,
			mw->mw_length, (unsigned long long)mw->mw_offset,
			mw->mw_handle, n < nsegs ? "more" : "last");

		r_xprt->rx_stats.reply_chunk_count++;
		r_xprt->rx_stats.total_rdma_request += seg->mr_len;
		nchunks++;
		seg += n;
		nsegs -= n;
	} while (nsegs);

	/* Update count of segments in the Reply chunk */
	*segcount = cpu_to_be32(nchunks);

	return iptr;
}

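/* Note the asymmetry with the two encoders above: the Reply chunk is
 * a counted array rather than a linked list, and it is the final item
 * in the transport header, so no xdr_zero terminator is written after
 * its last segment.
 */
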
/*
 * Copy write data inline.
 * This function is used for "small" requests. Data which is passed
 * to RPC via iovecs (or page list) is copied directly into the
 * pre-registered memory buffer for this request. For small amounts
 * of data, this is efficient. The cutoff value is tunable.
 */
static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
{
	int i, npages, curlen;
	int copy_len;
	unsigned char *srcp, *destp;
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
	int page_base;
	struct page **ppages;

	destp = rqst->rq_svec[0].iov_base;
	curlen = rqst->rq_svec[0].iov_len;
	destp += curlen;

	dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n",
		__func__, destp, rqst->rq_slen, curlen);

	copy_len = rqst->rq_snd_buf.page_len;

	if (rqst->rq_snd_buf.tail[0].iov_len) {
		curlen = rqst->rq_snd_buf.tail[0].iov_len;
		if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
			memmove(destp + copy_len,
				rqst->rq_snd_buf.tail[0].iov_base, curlen);
			r_xprt->rx_stats.pullup_copy_count += curlen;
		}
		dprintk("RPC: %s: tail destp 0x%p len %d\n",
			__func__, destp + copy_len, curlen);
		rqst->rq_svec[0].iov_len += curlen;
	}
	r_xprt->rx_stats.pullup_copy_count += copy_len;

	page_base = rqst->rq_snd_buf.page_base;
	ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
	page_base &= ~PAGE_MASK;
	npages = PAGE_ALIGN(page_base + copy_len) >> PAGE_SHIFT;
	for (i = 0; copy_len && i < npages; i++) {
		curlen = PAGE_SIZE - page_base;
		if (curlen > copy_len)
			curlen = copy_len;
		dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n",
			__func__, i, destp, copy_len, curlen);
		srcp = kmap_atomic(ppages[i]);
		memcpy(destp, srcp + page_base, curlen);
		kunmap_atomic(srcp);
		rqst->rq_svec[0].iov_len += curlen;
		destp += curlen;
		copy_len -= curlen;
		page_base = 0;
	}
	/* header now contains entire send message */
}

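/* Resulting layout (sketch): after pullup, rq_svec[0] holds one
 * contiguous region, [RPC header][page-list data][tail data], so the
 * entire Call fits in a single inline Send and no chunks need to be
 * registered.
 */
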
/*
 * Marshal a request: the primary job of this routine is to choose
 * the transfer modes. See comments below.
 *
 * Prepares up to two IOVs per Call message:
 *
 *  [0] -- RPC RDMA header
 *  [1] -- the RPC header/data
 *
 * Returns zero on success, otherwise a negative errno.
 */

int
rpcrdma_marshal_req(struct rpc_rqst *rqst)
{
	struct rpc_xprt *xprt = rqst->rq_xprt;
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
	enum rpcrdma_chunktype rtype, wtype;
	struct rpcrdma_msg *headerp;
	bool ddp_allowed;
	ssize_t hdrlen;
	size_t rpclen;
	__be32 *iptr;

#if defined(CONFIG_SUNRPC_BACKCHANNEL)
	if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
		return rpcrdma_bc_marshal_reply(rqst);
#endif

	headerp = rdmab_to_msg(req->rl_rdmabuf);
	/* don't byte-swap XID, it's already done in request */
	headerp->rm_xid = rqst->rq_xid;
	headerp->rm_vers = rpcrdma_version;
	headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
	headerp->rm_type = rdma_msg;

	/* When the ULP employs a GSS flavor that guarantees integrity
	 * or privacy, direct data placement of individual data items
	 * is not allowed.
	 */
	ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
			RPCAUTH_AUTH_DATATOUCH);

	/*
	 * Chunks needed for results?
	 *
	 * o If the expected result is under the inline threshold, all ops
	 *   return as inline.
	 * o Large read ops return data as write chunk(s), header as
	 *   inline.
	 * o Large non-read ops return as a single reply chunk.
	 */
	if (rpcrdma_results_inline(r_xprt, rqst))
		wtype = rpcrdma_noch;
	else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
		wtype = rpcrdma_writech;
	else
		wtype = rpcrdma_replych;

	/*
	 * Chunks needed for arguments?
	 *
	 * o If the total request is under the inline threshold, all ops
	 *   are sent as inline.
	 * o Large write ops transmit data as read chunk(s), header as
	 *   inline.
	 * o Large non-write ops are sent with the entire message as a
	 *   single read chunk (protocol 0-position special case).
	 *
	 * This assumes that the upper layer does not present a request
	 * that both has a data payload, and whose non-data arguments
	 * by themselves are larger than the inline threshold.
	 */
	if (rpcrdma_args_inline(r_xprt, rqst)) {
		rtype = rpcrdma_noch;
		rpcrdma_inline_pullup(rqst);
		rpclen = rqst->rq_svec[0].iov_len;
	} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
		rtype = rpcrdma_readch;
		rpclen = rqst->rq_svec[0].iov_len;
		rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
	} else {
		r_xprt->rx_stats.nomsg_call_count++;
		headerp->rm_type = rdma_nomsg;
		rtype = rpcrdma_areadch;
		rpclen = 0;
	}

	/* This implementation supports the following combinations
	 * of chunk lists in one RPC-over-RDMA Call message:
	 *
	 *   - Read list
	 *   - Write list
	 *   - Reply chunk
	 *   - Read list + Reply chunk
	 *
	 * It might not yet support the following combinations:
	 *
	 *   - Read list + Write list
	 *
	 * It does not support the following combinations:
	 *
	 *   - Write list + Reply chunk
	 *   - Read list + Write list + Reply chunk
	 *
	 * This implementation supports only a single chunk in each
	 * Read or Write list. Thus for example the client cannot
	 * send a Call message with a Position Zero Read chunk and a
	 * regular Read chunk at the same time.
	 */
	iptr = headerp->rm_body.rm_chunks;
	iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
	if (IS_ERR(iptr))
		goto out_unmap;
	iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
	if (IS_ERR(iptr))
		goto out_unmap;
	iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
	if (IS_ERR(iptr))
		goto out_unmap;
	hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;

	if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
		goto out_overflow;

	dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
		rqst->rq_task->tk_pid, __func__,
		transfertypes[rtype], transfertypes[wtype],
		hdrlen, rpclen);

	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
	req->rl_send_iov[0].length = hdrlen;
	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);

	req->rl_niovs = 1;
	if (rtype == rpcrdma_areadch)
		return 0;

	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
	req->rl_send_iov[1].length = rpclen;
	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);

	req->rl_niovs = 2;
	return 0;

out_overflow:
	pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
	       hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
	iptr = ERR_PTR(-EIO);

out_unmap:
	r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
	return PTR_ERR(iptr);
}

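/* Send buffer layout after a successful marshal (sketch): an ordinary
 * rdma_msg Call gathers two iovecs,
 *
 *	rl_send_iov[0] = transport header (hdrlen bytes)
 *	rl_send_iov[1] = RPC message (rpclen bytes)
 *
 * while an rdma_nomsg Call (rpcrdma_areadch) posts only iov[0] and
 * the server pulls the entire RPC message via the Read list.
 */
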
/*
 * Chase down a received write or reply chunklist to get length
 * RDMA'd by server. See the encoding map at rpcrdma_encode_write_list().
 */
static int
rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
{
	unsigned int i, total_len;
	struct rpcrdma_write_chunk *cur_wchunk;
	char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);

	i = be32_to_cpu(**iptrp);
	cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
	total_len = 0;
	while (i--) {
		struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
		ifdebug(FACILITY) {
			u64 off;
			xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
			dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n",
				__func__,
				be32_to_cpu(seg->rs_length),
				(unsigned long long)off,
				be32_to_cpu(seg->rs_handle));
		}
		total_len += be32_to_cpu(seg->rs_length);
		++cur_wchunk;
	}
	/* check and adjust for properly terminated write chunk */
	if (wrchunk) {
		__be32 *w = (__be32 *) cur_wchunk;
		if (*w++ != xdr_zero)
			return -1;
		cur_wchunk = (struct rpcrdma_write_chunk *) w;
	}
	if ((char *)cur_wchunk > base + rep->rr_len)
		return -1;

	*iptrp = (__be32 *) cur_wchunk;
	return total_len;
}

/**
 * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
 * @rqst: controlling RPC request
 * @srcp: points to RPC message payload in receive buffer
 * @copy_len: remaining length of receive buffer content
 * @pad: Write chunk pad bytes needed (zero for pure inline)
 *
 * The upper layer has set the maximum number of bytes it can
 * receive in each component of rq_rcv_buf. These values are set in
 * the head.iov_len, page_len, tail.iov_len, and buflen fields.
 *
 * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
 * many cases this function simply updates iov_base pointers in
 * rq_rcv_buf to point directly to the received reply data, to
 * avoid copying reply data.
 *
 * Returns the count of bytes which had to be memcopied.
 */
static unsigned long
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{
	unsigned long fixup_copy_count;
	int i, npages, curlen;
	char *destp;
	struct page **ppages;
	int page_base;

	/* The head iovec is redirected to the RPC reply message
	 * in the receive buffer, to avoid a memcopy.
	 */
	rqst->rq_rcv_buf.head[0].iov_base = srcp;
	rqst->rq_private_buf.head[0].iov_base = srcp;

	/* The contents of the receive buffer that follow
	 * head.iov_len bytes are copied into the page list.
	 */
	curlen = rqst->rq_rcv_buf.head[0].iov_len;
	if (curlen > copy_len)
		curlen = copy_len;
	dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n",
		__func__, srcp, copy_len, curlen);
	srcp += curlen;
	copy_len -= curlen;

	page_base = rqst->rq_rcv_buf.page_base;
	ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
	page_base &= ~PAGE_MASK;
	fixup_copy_count = 0;
	if (copy_len && rqst->rq_rcv_buf.page_len) {
		int pagelist_len;

		pagelist_len = rqst->rq_rcv_buf.page_len;
		if (pagelist_len > copy_len)
			pagelist_len = copy_len;
		npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
		for (i = 0; i < npages; i++) {
			curlen = PAGE_SIZE - page_base;
			if (curlen > pagelist_len)
				curlen = pagelist_len;

			dprintk("RPC: %s: page %d"
				" srcp 0x%p len %d curlen %d\n",
				__func__, i, srcp, copy_len, curlen);
			destp = kmap_atomic(ppages[i]);
			memcpy(destp + page_base, srcp, curlen);
			flush_dcache_page(ppages[i]);
			kunmap_atomic(destp);
			srcp += curlen;
			copy_len -= curlen;
			fixup_copy_count += curlen;
			pagelist_len -= curlen;
			if (!pagelist_len)
				break;
			page_base = 0;
		}

		/* Implicit padding for the last segment in a Write
		 * chunk is inserted inline at the front of the tail
		 * iovec. The upper layer ignores the content of
		 * the pad. Simply ensure inline content in the tail
		 * that follows the Write chunk is properly aligned.
		 */
		if (pad)
			srcp -= pad;
	}

	/* The tail iovec is redirected to the remaining data
	 * in the receive buffer, to avoid a memcopy.
	 */
	if (copy_len || pad) {
		rqst->rq_rcv_buf.tail[0].iov_base = srcp;
		rqst->rq_private_buf.tail[0].iov_base = srcp;
	}

	return fixup_copy_count;
}

void
rpcrdma_connect_worker(struct work_struct *work)
{
	struct rpcrdma_ep *ep =
		container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
	struct rpcrdma_xprt *r_xprt =
		container_of(ep, struct rpcrdma_xprt, rx_ep);
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;

	spin_lock_bh(&xprt->transport_lock);
	if (++xprt->connect_cookie == 0)	/* maintain a reserved value */
		++xprt->connect_cookie;
	if (ep->rep_connected > 0) {
		if (!xprt_test_and_set_connected(xprt))
			xprt_wake_pending_tasks(xprt, 0);
	} else {
		if (xprt_test_and_clear_connected(xprt))
			xprt_wake_pending_tasks(xprt, -ENOTCONN);
	}
	spin_unlock_bh(&xprt->transport_lock);
}

#if defined(CONFIG_SUNRPC_BACKCHANNEL)
/* By convention, backchannel calls arrive via rdma_msg type
 * messages, and never populate the chunk lists. This makes
 * the RPC/RDMA header small and fixed in size, so it is
 * straightforward to check the RPC header's direction field.
 */
static bool
rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
{
	__be32 *p = (__be32 *)headerp;

	if (headerp->rm_type != rdma_msg)
		return false;
	if (headerp->rm_body.rm_chunks[0] != xdr_zero)
		return false;
	if (headerp->rm_body.rm_chunks[1] != xdr_zero)
		return false;
	if (headerp->rm_body.rm_chunks[2] != xdr_zero)
		return false;

	/* sanity: p[7] is the XID in the embedded RPC header,
	 * which must match the transport header's XID */
	if (p[7] != headerp->rm_xid)
		return false;
	/* call direction: the word following the RPC XID */
	if (p[8] != cpu_to_be32(RPC_CALL))
		return false;

	return true;
}
#endif	/* CONFIG_SUNRPC_BACKCHANNEL */

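/* Header layout assumed by the p[7]/p[8] checks above, as 32-bit
 * words:
 *
 *	p[0]..p[3]: rm_xid, rm_vers, rm_credit, rm_type
 *	p[4]..p[6]: three empty chunk-list discriminators
 *	p[7]:       XID of the embedded RPC header
 *	p[8]:       RPC call direction (RPC_CALL or RPC_REPLY)
 */
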
/*
 * This function is called when an async event is posted to
 * the connection which changes the connection state. All it
 * does at this point is mark the connection up/down; the rpc
 * timers do the rest.
 */
void
rpcrdma_conn_func(struct rpcrdma_ep *ep)
{
	schedule_delayed_work(&ep->rep_connect_worker, 0);
}

/* Process received RPC/RDMA messages.
 *
 * Errors must result in the RPC task either being awakened, or
 * allowed to timeout, to discover the errors at that time.
 */
void
rpcrdma_reply_handler(struct rpcrdma_rep *rep)
{
	struct rpcrdma_msg *headerp;
	struct rpcrdma_req *req;
	struct rpc_rqst *rqst;
	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	__be32 *iptr;
	int rdmalen, status, rmerr;
	unsigned long cwnd;

	dprintk("RPC: %s: incoming rep %p\n", __func__, rep);

	if (rep->rr_len == RPCRDMA_BAD_LEN)
		goto out_badstatus;
	if (rep->rr_len < RPCRDMA_HDRLEN_ERR)
		goto out_shortreply;

	headerp = rdmab_to_msg(rep->rr_rdmabuf);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
	if (rpcrdma_is_bcall(headerp))
		goto out_bcall;
#endif

	/* Match incoming rpcrdma_rep to an rpcrdma_req to
	 * get context for handling any incoming chunks.
	 */
	spin_lock_bh(&xprt->transport_lock);
	rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
	if (!rqst)
		goto out_nomatch;

	req = rpcr_to_rdmar(rqst);
	if (req->rl_reply)
		goto out_duplicate;

	/* Sanity checking has passed. We are now committed
	 * to complete this transaction.
	 */
	list_del_init(&rqst->rq_list);
	spin_unlock_bh(&xprt->transport_lock);
	dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n",
		__func__, rep, req, be32_to_cpu(headerp->rm_xid));

	/* from here on, the reply is no longer an orphan */
	req->rl_reply = rep;
	xprt->reestablish_timeout = 0;

	if (headerp->rm_vers != rpcrdma_version)
		goto out_badversion;

	/* check for expected message types */
	/* The order of some of these tests is important. */
	switch (headerp->rm_type) {
	case rdma_msg:
		/* never expect read chunks */
		/* never expect reply chunks (two ways to check) */
		/* never expect write chunks without having offered RDMA */
		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
		    (headerp->rm_body.rm_chunks[1] == xdr_zero &&
		     headerp->rm_body.rm_chunks[2] != xdr_zero) ||
		    (headerp->rm_body.rm_chunks[1] != xdr_zero &&
		     list_empty(&req->rl_registered)))
			goto badheader;
		if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
			/* count any expected write chunks in read reply */
			/* start at write chunk array count */
			iptr = &headerp->rm_body.rm_chunks[2];
			rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
			/* check for validity, and no reply chunk after */
			if (rdmalen < 0 || *iptr++ != xdr_zero)
				goto badheader;
			rep->rr_len -=
			    ((unsigned char *)iptr - (unsigned char *)headerp);
			status = rep->rr_len + rdmalen;
			r_xprt->rx_stats.total_rdma_reply += rdmalen;
			/* special case - last chunk may omit padding */
			if (rdmalen &= 3) {
				rdmalen = 4 - rdmalen;
				status += rdmalen;
			}
		} else {
			/* else ordinary inline */
			rdmalen = 0;
			iptr = (__be32 *)((unsigned char *)headerp +
							RPCRDMA_HDRLEN_MIN);
			rep->rr_len -= RPCRDMA_HDRLEN_MIN;
			status = rep->rr_len;
		}

		r_xprt->rx_stats.fixup_copy_count +=
			rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
					     rdmalen);
		break;

	case rdma_nomsg:
		/* never expect read or write chunks, always reply chunks */
		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
		    headerp->rm_body.rm_chunks[1] != xdr_zero ||
		    headerp->rm_body.rm_chunks[2] != xdr_one ||
		    list_empty(&req->rl_registered))
			goto badheader;
		iptr = (__be32 *)((unsigned char *)headerp +
							RPCRDMA_HDRLEN_MIN);
		rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
		if (rdmalen < 0)
			goto badheader;
		r_xprt->rx_stats.total_rdma_reply += rdmalen;
		/* Reply chunk buffer already is the reply vector - no fixup. */
		status = rdmalen;
		break;

	case rdma_error:
		goto out_rdmaerr;

badheader:
	default:
		dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
			rqst->rq_task->tk_pid, __func__,
			be32_to_cpu(headerp->rm_type));
		status = -EIO;
		r_xprt->rx_stats.bad_reply_count++;
		break;
	}

out:
	/* Invalidate and flush the data payloads before waking the
	 * waiting application. This guarantees the memory region is
	 * properly fenced from the server before the application
	 * accesses the data. It also ensures proper send flow
	 * control: waking the next RPC waits until this RPC has
	 * relinquished all its Send Queue entries.
	 */
	if (!list_empty(&req->rl_registered))
		r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);

	spin_lock_bh(&xprt->transport_lock);
	cwnd = xprt->cwnd;
	xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
	if (xprt->cwnd > cwnd)
		xprt_release_rqst_cong(rqst->rq_task);

	xprt_complete_rqst(rqst->rq_task, status);
	spin_unlock_bh(&xprt->transport_lock);
	dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
		__func__, xprt, rqst, status);
	return;

out_badstatus:
	rpcrdma_recv_buffer_put(rep);
	if (r_xprt->rx_ep.rep_connected == 1) {
		r_xprt->rx_ep.rep_connected = -EIO;
		rpcrdma_conn_func(&r_xprt->rx_ep);
	}
	return;

#if defined(CONFIG_SUNRPC_BACKCHANNEL)
out_bcall:
	rpcrdma_bc_receive_call(r_xprt, rep);
	return;
#endif

/* If the incoming reply terminated a pending RPC, the next
 * RPC call will post a replacement receive buffer as it is
 * being marshaled.
 */
out_badversion:
	dprintk("RPC: %s: invalid version %d\n",
		__func__, be32_to_cpu(headerp->rm_vers));
	status = -EIO;
	r_xprt->rx_stats.bad_reply_count++;
	goto out;

out_rdmaerr:
	rmerr = be32_to_cpu(headerp->rm_body.rm_error.rm_err);
	switch (rmerr) {
	case ERR_VERS:
		pr_err("%s: server reports header version error (%u-%u)\n",
		       __func__,
		       be32_to_cpu(headerp->rm_body.rm_error.rm_vers_low),
		       be32_to_cpu(headerp->rm_body.rm_error.rm_vers_high));
		break;
	case ERR_CHUNK:
		pr_err("%s: server reports header decoding error\n",
		       __func__);
		break;
	default:
		pr_err("%s: server reports unknown error %d\n",
		       __func__, rmerr);
	}
	status = -EREMOTEIO;
	r_xprt->rx_stats.bad_reply_count++;
	goto out;

/* If no pending RPC transaction was matched, post a replacement
 * receive buffer before returning.
 */
out_shortreply:
	dprintk("RPC: %s: short/invalid reply\n", __func__);
	goto repost;

out_nomatch:
	spin_unlock_bh(&xprt->transport_lock);
	dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n",
		__func__, be32_to_cpu(headerp->rm_xid),
		rep->rr_len);
	goto repost;

out_duplicate:
	spin_unlock_bh(&xprt->transport_lock);
	dprintk("RPC: %s: "
		"duplicate reply %p to RPC request %p: xid 0x%08x\n",
		__func__, rep, req, be32_to_cpu(headerp->rm_xid));

repost:
	r_xprt->rx_stats.bad_reply_count++;
	if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
		rpcrdma_recv_buffer_put(rep);
}