/*
 * Copyright (c) 2015 Oracle. All rights reserved.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 */
/* Lightweight memory registration using Fast Memory Regions (FMR).
 * Referred to sometimes as MTHCAFMR mode.
 *
 * FMR uses synchronous memory registration and deregistration.
 * FMR registration is known to be fast, but FMR deregistration
 * can take tens of usecs to complete.
 */

/* Normal operation
 *
 * A Memory Region is prepared for RDMA READ or WRITE using the
 * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
 * finished, the Memory Region is unmapped using the ib_unmap_fmr
 * verb (fmr_op_unmap).
 */
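/* Illustrative sketch only (not part of this file): the transport core
 * is expected to drive these entry points roughly like so, assuming the
 * usual rpcrdma_memreg_ops calling conventions:
 *
 *	ia->ri_ops = &rpcrdma_fmr_memreg_ops;
 *	...
 *	n = ia->ri_ops->ro_map(r_xprt, seg, nsegs, writing);
 *	...	(Send posted; RDMA READ/WRITE; RPC Reply arrives)
 *	ia->ri_ops->ro_unmap_sync(r_xprt, req);
 */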
#include "xprt_rdma.h"

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif
/* Maximum scatter/gather per FMR */
#define RPCRDMA_MAX_FMR_SGES	(64)
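/* Note: with 64 single-page SGEs, one FMR can convey at most
 * 64 * PAGE_SIZE bytes of payload (256KB, assuming 4KB pages).
 */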
/* Access mode of externally registered pages */
enum {
	RPCRDMA_FMR_ACCESS_FLAGS	= IB_ACCESS_REMOTE_WRITE |
					  IB_ACCESS_REMOTE_READ,
};
bool
fmr_is_supported(struct rpcrdma_ia *ia)
{
	if (!ia->ri_device->alloc_fmr) {
		pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
			ia->ri_device->name);
		return false;
	}
	return true;
}
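/* The ->alloc_fmr device verb is optional: a provider that does not
 * implement FMRs leaves the pointer NULL, so testing the pointer is
 * sufficient to detect support.
 */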
static int
__fmr_init(struct rpcrdma_mw *mw, struct ib_pd *pd)
{
	static struct ib_fmr_attr fmr_attr = {
		.max_pages	= RPCRDMA_MAX_FMR_SGES,
		.max_maps	= 1,
		.page_shift	= PAGE_SHIFT
	};

	mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
				       sizeof(u64), GFP_KERNEL);
	if (!mw->fmr.fm_physaddrs)
		goto out_free;

	mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
			    sizeof(*mw->mw_sg), GFP_KERNEL);
	if (!mw->mw_sg)
		goto out_free;

	sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);

	mw->fmr.fm_mr = ib_alloc_fmr(pd, RPCRDMA_FMR_ACCESS_FLAGS,
				     &fmr_attr);
	if (IS_ERR(mw->fmr.fm_mr))
		goto out_fmr_err;

	return 0;

out_fmr_err:
	dprintk("RPC:       %s: ib_alloc_fmr returned %ld\n", __func__,
		PTR_ERR(mw->fmr.fm_mr));

out_free:
	kfree(mw->mw_sg);
	kfree(mw->fmr.fm_physaddrs);
	return -ENOMEM;
}
static int
__fmr_unmap(struct rpcrdma_mw *mw)
{
	LIST_HEAD(l);
	int rc;

	/* ib_unmap_fmr() takes a list of FMRs; build a list of one */
	list_add(&mw->fmr.fm_mr->list, &l);
	rc = ib_unmap_fmr(&l);
	list_del_init(&mw->fmr.fm_mr->list);
	return rc;
}
static void
__fmr_release(struct rpcrdma_mw *r)
{
	int rc;

	kfree(r->fmr.fm_physaddrs);
	kfree(r->mw_sg);

	/* In case this one was left mapped, try to unmap it
	 * to prevent dealloc_fmr from failing with EBUSY
	 */
	rc = __fmr_unmap(r);
	if (rc)
		pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
		       r, rc);

	rc = ib_dealloc_fmr(r->fmr.fm_mr);
	if (rc)
		pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
		       r, rc);
}
/* Reset of a single FMR.
 *
 * There's no recovery if this fails. The FMR is abandoned, but
 * remains in rb_all. It will be cleaned up when the transport is
 * destroyed.
 */
static void
fmr_op_recover_mr(struct rpcrdma_mw *mw)
{
	struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
	int rc;

	/* ORDER: invalidate first */
	rc = __fmr_unmap(mw);

	/* ORDER: then DMA unmap */
	ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
			mw->mw_sg, mw->mw_nents, mw->mw_dir);
	if (rc) {
		pr_err("rpcrdma: FMR reset status %d, %p orphaned\n",
		       rc, mw);
		r_xprt->rx_stats.mrs_orphaned++;
		return;
	}

	rpcrdma_put_mw(r_xprt, mw);
	r_xprt->rx_stats.mrs_recovered++;
}
static int
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
	    struct rpcrdma_create_data_internal *cdata)
{
	rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
						      RPCRDMA_MAX_DATA_SEGS /
						      RPCRDMA_MAX_FMR_SGES));
	return 0;
}
/* FMR mode conveys up to 64 pages of payload per chunk segment.
 */
static size_t
fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
{
	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
		     RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
}
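/* Worked example with illustrative values only: if RPCRDMA_MAX_HDR_SEGS
 * were 8, the transport header could convey 8 * 64 = 512 pages per RPC,
 * and the min_t() above would cap the result at RPCRDMA_MAX_DATA_SEGS.
 */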
static int
fmr_op_init(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
	struct rpcrdma_mw *r;
	int i, rc;

	spin_lock_init(&buf->rb_mwlock);
	INIT_LIST_HEAD(&buf->rb_mws);
	INIT_LIST_HEAD(&buf->rb_all);

	i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
	i += 2;				/* head + tail */
	i *= buf->rb_max_requests;	/* one set for each RPC slot */
	dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);

	while (i--) {
		r = kzalloc(sizeof(*r), GFP_KERNEL);
		if (!r)
			return -ENOMEM;

		rc = __fmr_init(r, pd);
		if (rc) {
			kfree(r);
			return rc;
		}

		r->mw_xprt = r_xprt;
		list_add(&r->mw_list, &buf->rb_mws);
		list_add(&r->mw_all, &buf->rb_all);
	}
	return 0;
}
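/* Worked example with illustrative values only: if RPCRDMA_MAX_DATA_SEGS
 * were 64, then 64 / 64 = 1 FMR covers the largest payload, plus 2 for
 * head and tail, giving i = 3 * rb_max_requests FMRs for the transport.
 */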
/* Use the ib_map_phys_fmr() verb to register a memory region
 * for remote access via RDMA READ or RDMA WRITE.
 */
static int
fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
	   int nsegs, bool writing)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	int len, pageoff, i, rc;
	struct rpcrdma_mw *mw;
	u64 *dma_pages;

	mw = seg1->rl_mw;
	seg1->rl_mw = NULL;
	if (mw)
		rpcrdma_defer_mr_recovery(mw);
	mw = rpcrdma_get_mw(r_xprt);
	if (!mw)
		return -ENOBUFS;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (nsegs > RPCRDMA_MAX_FMR_SGES)
		nsegs = RPCRDMA_MAX_FMR_SGES;
	for (i = 0; i < nsegs;) {
		if (seg->mr_page)
			sg_set_page(&mw->mw_sg[i],
				    seg->mr_page,
				    seg->mr_len,
				    offset_in_page(seg->mr_offset));
		else
			sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
				   seg->mr_len);
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	mw->mw_nents = i;
	mw->mw_dir = rpcrdma_data_dir(writing);

	if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
			   mw->mw_sg, mw->mw_nents, mw->mw_dir))
		goto out_dmamap_err;

	for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
		dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
	rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
			     dma_pages[0]);
	if (rc)
		goto out_maperr;

	seg1->rl_mw = mw;
	seg1->mr_rkey = mw->fmr.fm_mr->rkey;
	seg1->mr_base = dma_pages[0] + pageoff;
	seg1->mr_nsegs = mw->mw_nents;
	seg1->mr_len = len;
	return mw->mw_nents;

out_dmamap_err:
	pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
	       mw->mw_sg, mw->mw_nents);
	rpcrdma_defer_mr_recovery(mw);
	return -ENOBUFS;

out_maperr:
	pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
	       len, (unsigned long long)dma_pages[0],
	       pageoff, mw->mw_nents, rc);
	rpcrdma_defer_mr_recovery(mw);
	return -EIO;
}
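/* Illustrative sketch only: an FMR maps one virtually contiguous
 * region, which is why the "holes" check above stops the loop at any
 * interior page gap. A chunk encoder is expected to call ro_map
 * repeatedly, advancing by the returned segment count, until the whole
 * payload has been registered:
 *
 *	do {
 *		n = ia->ri_ops->ro_map(r_xprt, seg, nsegs, writing);
 *		if (n < 0)
 *			break;		(map failure)
 *		seg += n;
 *		nsegs -= n;
 *	} while (nsegs);
 */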
/* Invalidate all memory regions that were registered for "req".
 *
 * Sleeps until it is safe for the host CPU to access the
 * previously mapped memory regions.
 */
static void
fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	struct rpcrdma_mr_seg *seg;
	unsigned int i, nchunks;
	struct rpcrdma_mw *mw;
	LIST_HEAD(unmap_list);
	int rc;

	dprintk("RPC:       %s: req %p\n", __func__, req);

	/* ORDER: Invalidate all of the req's MRs first
	 *
	 * ib_unmap_fmr() is slow, so use a single call instead
	 * of one call per mapped FMR.
	 */
	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
		seg = &req->rl_segments[i];
		mw = seg->rl_mw;

		list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);

		i += seg->mr_nsegs;
	}
	rc = ib_unmap_fmr(&unmap_list);
	if (rc)
		goto out_reset;

	/* ORDER: Now DMA unmap all of the req's MRs, and return
	 * them to the free MW list.
	 */
	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
		seg = &req->rl_segments[i];
		mw = seg->rl_mw;

		list_del_init(&mw->fmr.fm_mr->list);
		ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
				mw->mw_sg, mw->mw_nents, mw->mw_dir);
		rpcrdma_put_mw(r_xprt, mw);

		i += seg->mr_nsegs;
		seg->mr_nsegs = 0;
		seg->rl_mw = NULL;
	}

	req->rl_nchunks = 0;
	return;

out_reset:
	pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);

	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
		seg = &req->rl_segments[i];
		mw = seg->rl_mw;

		list_del_init(&mw->fmr.fm_mr->list);
		fmr_op_recover_mr(mw);

		i += seg->mr_nsegs;
	}
}
/* Use a slow, safe mechanism to invalidate all memory regions
 * that were registered for "req".
 */
static void
fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
		  bool sync)
{
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mw *mw;
	unsigned int i;

	for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
		seg = &req->rl_segments[i];
		mw = seg->rl_mw;

		if (sync)
			fmr_op_recover_mr(mw);
		else
			rpcrdma_defer_mr_recovery(mw);

		i += seg->mr_nsegs;
		seg->mr_nsegs = 0;
		seg->rl_mw = NULL;
	}
}
static void
fmr_op_destroy(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mw *r;

	while (!list_empty(&buf->rb_all)) {
		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
		list_del(&r->mw_all);
		__fmr_release(r);
		kfree(r);
	}
}
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
	.ro_map				= fmr_op_map,
	.ro_unmap_sync			= fmr_op_unmap_sync,
	.ro_unmap_safe			= fmr_op_unmap_safe,
	.ro_recover_mr			= fmr_op_recover_mr,
	.ro_open			= fmr_op_open,
	.ro_maxpages			= fmr_op_maxpages,
	.ro_init			= fmr_op_init,
	.ro_destroy			= fmr_op_destroy,
	.ro_displayname			= "fmr",
};