Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
6a5b99a4 | 18 | * http://www.gnu.org/licenses/gpl-2.0.html |
d7e09d03 | 19 | * |
d7e09d03 PT |
20 | * GPL HEADER END |
21 | */ | |
22 | /* | |
23 | * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. | |
24 | * Use is subject to license terms. | |
25 | * | |
1dc563a6 | 26 | * Copyright (c) 2011, 2015, Intel Corporation. |
d7e09d03 PT |
27 | */ |
28 | /* | |
29 | * This file is part of Lustre, http://www.lustre.org/ | |
30 | * Lustre is a trademark of Sun Microsystems, Inc. | |
31 | */ | |
32 | ||
33 | #define DEBUG_SUBSYSTEM S_RPC | |
e27db149 GKH |
34 | #include "../include/obd_support.h" |
35 | #include "../include/lustre_net.h" | |
36 | #include "../include/lustre_lib.h" | |
37 | #include "../include/obd.h" | |
38 | #include "../include/obd_class.h" | |
d7e09d03 PT |
39 | #include "ptlrpc_internal.h" |
40 | ||
41 | /** | |
42 | * Helper function. Sends \a len bytes from \a base at offset \a offset | |
43 | * over \a conn connection to portal \a portal. | |
44 | * Returns 0 on success or error code. | |
45 | */ | |
3949015e KM |
46 | static int ptl_send_buf(lnet_handle_md_t *mdh, void *base, int len, |
47 | lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid, | |
48 | struct ptlrpc_connection *conn, int portal, __u64 xid, | |
49 | unsigned int offset) | |
d7e09d03 | 50 | { |
d0bfef31 CH |
51 | int rc; |
52 | lnet_md_t md; | |
d7e09d03 | 53 | |
3949015e | 54 | LASSERT(portal != 0); |
3949015e | 55 | CDEBUG(D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer)); |
d0bfef31 CH |
56 | md.start = base; |
57 | md.length = len; | |
d7e09d03 | 58 | md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; |
d0bfef31 CH |
59 | md.options = PTLRPC_MD_OPTIONS; |
60 | md.user_ptr = cbid; | |
d7e09d03 PT |
61 | md.eq_handle = ptlrpc_eq_h; |
62 | ||
63 | if (unlikely(ack == LNET_ACK_REQ && | |
cb68dd2d KM |
64 | OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, |
65 | OBD_FAIL_ONCE))) { | |
d7e09d03 PT |
66 | /* don't ask for the ack to simulate failing client */ |
67 | ack = LNET_NOACK_REQ; | |
68 | } | |
69 | ||
3949015e | 70 | rc = LNetMDBind(md, LNET_UNLINK, mdh); |
d7e09d03 | 71 | if (unlikely(rc != 0)) { |
3949015e KM |
72 | CERROR("LNetMDBind failed: %d\n", rc); |
73 | LASSERT(rc == -ENOMEM); | |
0a3bdb00 | 74 | return -ENOMEM; |
d7e09d03 PT |
75 | } |
76 | ||
f537dd2c | 77 | CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n", |
d7e09d03 PT |
78 | len, portal, xid, offset); |
79 | ||
3949015e KM |
80 | rc = LNetPut(conn->c_self, *mdh, ack, |
81 | conn->c_peer, portal, xid, offset, 0); | |
d7e09d03 PT |
82 | if (unlikely(rc != 0)) { |
83 | int rc2; | |
84 | /* We're going to get an UNLINK event when I unlink below, | |
85 | * which will complete just like any other failed send, so | |
dadfcdab OD |
86 | * I fall through and return success here! |
87 | */ | |
f537dd2c | 88 | CERROR("LNetPut(%s, %d, %lld) failed: %d\n", |
d7e09d03 PT |
89 | libcfs_id2str(conn->c_peer), portal, xid, rc); |
90 | rc2 = LNetMDUnlink(*mdh); | |
91 | LASSERTF(rc2 == 0, "rc2 = %d\n", rc2); | |
92 | } | |
93 | ||
0a3bdb00 | 94 | return 0; |
d7e09d03 PT |
95 | } |
96 | ||
97 | static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count) | |
98 | { | |
99 | int i; | |
100 | ||
101 | for (i = 0; i < count; i++) | |
102 | LNetMDUnlink(bd_mds[i]); | |
103 | } | |
104 | ||
d7e09d03 PT |
105 | /** |
106 | * Register bulk at the sender for later transfer. | |
107 | * Returns 0 on success or error code. | |
108 | */ | |
12d0be62 | 109 | static int ptlrpc_register_bulk(struct ptlrpc_request *req) |
d7e09d03 PT |
110 | { |
111 | struct ptlrpc_bulk_desc *desc = req->rq_bulk; | |
112 | lnet_process_id_t peer; | |
113 | int rc = 0; | |
114 | int rc2; | |
115 | int posted_md; | |
116 | int total_md; | |
117 | __u64 xid; | |
d0bfef31 CH |
118 | lnet_handle_me_t me_h; |
119 | lnet_md_t md; | |
d7e09d03 PT |
120 | |
121 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET)) | |
0a3bdb00 | 122 | return 0; |
d7e09d03 PT |
123 | |
124 | /* NB no locking required until desc is on the network */ | |
125 | LASSERT(desc->bd_nob > 0); | |
126 | LASSERT(desc->bd_md_count == 0); | |
127 | LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); | |
128 | LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); | |
8b382089 | 129 | LASSERT(desc->bd_req); |
d7e09d03 PT |
130 | LASSERT(desc->bd_type == BULK_PUT_SINK || |
131 | desc->bd_type == BULK_GET_SOURCE); | |
132 | ||
133 | /* cleanup the state of the bulk for it will be reused */ | |
134 | if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY) | |
135 | desc->bd_nob_transferred = 0; | |
136 | else | |
137 | LASSERT(desc->bd_nob_transferred == 0); | |
138 | ||
139 | desc->bd_failure = 0; | |
140 | ||
141 | peer = desc->bd_import->imp_connection->c_peer; | |
142 | ||
143 | LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); | |
144 | LASSERT(desc->bd_cbid.cbid_arg == desc); | |
145 | ||
146 | /* An XID is only used for a single request from the client. | |
147 | * For retried bulk transfers, a new XID will be allocated in | |
148 | * in ptlrpc_check_set() if it needs to be resent, so it is not | |
149 | * using the same RDMA match bits after an error. | |
150 | * | |
151 | * For multi-bulk RPCs, rq_xid is the last XID needed for bulks. The | |
dadfcdab OD |
152 | * first bulk XID is power-of-two aligned before rq_xid. LU-1431 |
153 | */ | |
d7e09d03 PT |
154 | xid = req->rq_xid & ~((__u64)desc->bd_md_max_brw - 1); |
155 | LASSERTF(!(desc->bd_registered && | |
156 | req->rq_send_state != LUSTRE_IMP_REPLAY) || | |
157 | xid != desc->bd_last_xid, | |
b0f5aad5 | 158 | "registered: %d rq_xid: %llu bd_last_xid: %llu\n", |
d7e09d03 PT |
159 | desc->bd_registered, xid, desc->bd_last_xid); |
160 | ||
161 | total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV; | |
162 | desc->bd_registered = 1; | |
163 | desc->bd_last_xid = xid; | |
164 | desc->bd_md_count = total_md; | |
165 | md.user_ptr = &desc->bd_cbid; | |
166 | md.eq_handle = ptlrpc_eq_h; | |
167 | md.threshold = 1; /* PUT or GET */ | |
168 | ||
169 | for (posted_md = 0; posted_md < total_md; posted_md++, xid++) { | |
170 | md.options = PTLRPC_MD_OPTIONS | | |
171 | ((desc->bd_type == BULK_GET_SOURCE) ? | |
172 | LNET_MD_OP_GET : LNET_MD_OP_PUT); | |
173 | ptlrpc_fill_bulk_md(&md, desc, posted_md); | |
174 | ||
175 | rc = LNetMEAttach(desc->bd_portal, peer, xid, 0, | |
176 | LNET_UNLINK, LNET_INS_AFTER, &me_h); | |
177 | if (rc != 0) { | |
b0f5aad5 | 178 | CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", |
3c92a0bf | 179 | desc->bd_import->imp_obd->obd_name, xid, |
d7e09d03 PT |
180 | posted_md, rc); |
181 | break; | |
182 | } | |
183 | ||
184 | /* About to let the network at it... */ | |
185 | rc = LNetMDAttach(me_h, md, LNET_UNLINK, | |
186 | &desc->bd_mds[posted_md]); | |
187 | if (rc != 0) { | |
b0f5aad5 | 188 | CERROR("%s: LNetMDAttach failed x%llu/%d: rc = %d\n", |
3c92a0bf | 189 | desc->bd_import->imp_obd->obd_name, xid, |
d7e09d03 PT |
190 | posted_md, rc); |
191 | rc2 = LNetMEUnlink(me_h); | |
192 | LASSERT(rc2 == 0); | |
193 | break; | |
194 | } | |
195 | } | |
196 | ||
197 | if (rc != 0) { | |
198 | LASSERT(rc == -ENOMEM); | |
199 | spin_lock(&desc->bd_lock); | |
200 | desc->bd_md_count -= total_md - posted_md; | |
201 | spin_unlock(&desc->bd_lock); | |
202 | LASSERT(desc->bd_md_count >= 0); | |
203 | mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); | |
204 | req->rq_status = -ENOMEM; | |
0a3bdb00 | 205 | return -ENOMEM; |
d7e09d03 PT |
206 | } |
207 | ||
208 | /* Set rq_xid to matchbits of the final bulk so that server can | |
dadfcdab OD |
209 | * infer the number of bulks that were prepared |
210 | */ | |
d7e09d03 PT |
211 | req->rq_xid = --xid; |
212 | LASSERTF(desc->bd_last_xid == (req->rq_xid & PTLRPC_BULK_OPS_MASK), | |
b0f5aad5 | 213 | "bd_last_xid = x%llu, rq_xid = x%llu\n", |
d7e09d03 PT |
214 | desc->bd_last_xid, req->rq_xid); |
215 | ||
216 | spin_lock(&desc->bd_lock); | |
217 | /* Holler if peer manages to touch buffers before he knows the xid */ | |
218 | if (desc->bd_md_count != total_md) | |
219 | CWARN("%s: Peer %s touched %d buffers while I registered\n", | |
3c92a0bf | 220 | desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer), |
d7e09d03 PT |
221 | total_md - desc->bd_md_count); |
222 | spin_unlock(&desc->bd_lock); | |
223 | ||
2d00bd17 JP |
224 | CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, xid x%#llx-%#llx, portal %u\n", |
225 | desc->bd_md_count, | |
d7e09d03 PT |
226 | desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink", |
227 | desc->bd_iov_count, desc->bd_nob, | |
228 | desc->bd_last_xid, req->rq_xid, desc->bd_portal); | |
229 | ||
0a3bdb00 | 230 | return 0; |
d7e09d03 | 231 | } |
d7e09d03 PT |
232 | |
233 | /** | |
234 | * Disconnect a bulk desc from the network. Idempotent. Not | |
235 | * thread-safe (i.e. only interlocks with completion callback). | |
236 | * Returns 1 on success or 0 if network unregistration failed for whatever | |
237 | * reason. | |
238 | */ | |
239 | int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) | |
240 | { | |
241 | struct ptlrpc_bulk_desc *desc = req->rq_bulk; | |
d0bfef31 CH |
242 | wait_queue_head_t *wq; |
243 | struct l_wait_info lwi; | |
244 | int rc; | |
d7e09d03 PT |
245 | |
246 | LASSERT(!in_interrupt()); /* might sleep */ | |
247 | ||
248 | /* Let's setup deadline for reply unlink. */ | |
249 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && | |
81ea39ec | 250 | async && req->rq_bulk_deadline == 0 && cfs_fail_val == 0) |
219e6de6 | 251 | req->rq_bulk_deadline = ktime_get_real_seconds() + LONG_UNLINK; |
d7e09d03 PT |
252 | |
253 | if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ | |
0a3bdb00 | 254 | return 1; /* never registered */ |
d7e09d03 PT |
255 | |
256 | LASSERT(desc->bd_req == req); /* bd_req NULL until registered */ | |
257 | ||
258 | /* the unlink ensures the callback happens ASAP and is the last | |
259 | * one. If it fails, it must be because completion just happened, | |
260 | * but we must still l_wait_event() in this case to give liblustre | |
dadfcdab OD |
261 | * a chance to run client_bulk_callback() |
262 | */ | |
d7e09d03 PT |
263 | mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); |
264 | ||
265 | if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ | |
0a3bdb00 | 266 | return 1; /* never registered */ |
d7e09d03 PT |
267 | |
268 | /* Move to "Unregistering" phase as bulk was not unlinked yet. */ | |
81ea39ec | 269 | ptlrpc_rqphase_move(req, RQ_PHASE_UNREG_BULK); |
d7e09d03 PT |
270 | |
271 | /* Do not wait for unlink to finish. */ | |
272 | if (async) | |
0a3bdb00 | 273 | return 0; |
d7e09d03 | 274 | |
8b382089 | 275 | if (req->rq_set) |
d7e09d03 PT |
276 | wq = &req->rq_set->set_waitq; |
277 | else | |
278 | wq = &req->rq_reply_waitq; | |
279 | ||
280 | for (;;) { | |
281 | /* Network access will complete in finite time but the HUGE | |
dadfcdab OD |
282 | * timeout lets us CWARN for visibility of sluggish LNDs |
283 | */ | |
d7e09d03 PT |
284 | lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), |
285 | cfs_time_seconds(1), NULL, NULL); | |
286 | rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi); | |
287 | if (rc == 0) { | |
288 | ptlrpc_rqphase_move(req, req->rq_next_phase); | |
0a3bdb00 | 289 | return 1; |
d7e09d03 PT |
290 | } |
291 | ||
292 | LASSERT(rc == -ETIMEDOUT); | |
293 | DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", | |
294 | desc); | |
295 | } | |
0a3bdb00 | 296 | return 0; |
d7e09d03 PT |
297 | } |
298 | EXPORT_SYMBOL(ptlrpc_unregister_bulk); | |
299 | ||
300 | static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) | |
301 | { | |
d0bfef31 CH |
302 | struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; |
303 | struct ptlrpc_service *svc = svcpt->scp_service; | |
219e6de6 | 304 | int service_time = max_t(int, ktime_get_real_seconds() - |
d7e09d03 PT |
305 | req->rq_arrival_time.tv_sec, 1); |
306 | ||
307 | if (!(flags & PTLRPC_REPLY_EARLY) && | |
8b382089 | 308 | (req->rq_type != PTL_RPC_MSG_ERR) && req->rq_reqmsg && |
d7e09d03 PT |
309 | !(lustre_msg_get_flags(req->rq_reqmsg) & |
310 | (MSG_RESENT | MSG_REPLAY | | |
311 | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { | |
312 | /* early replies, errors and recovery requests don't count | |
dadfcdab OD |
313 | * toward our service time estimate |
314 | */ | |
d7e09d03 PT |
315 | int oldse = at_measured(&svcpt->scp_at_estimate, service_time); |
316 | ||
317 | if (oldse != 0) { | |
318 | DEBUG_REQ(D_ADAPTTO, req, | |
319 | "svc %s changed estimate from %d to %d", | |
320 | svc->srv_name, oldse, | |
321 | at_get(&svcpt->scp_at_estimate)); | |
322 | } | |
323 | } | |
324 | /* Report actual service time for client latency calc */ | |
325 | lustre_msg_set_service_time(req->rq_repmsg, service_time); | |
326 | /* Report service time estimate for future client reqs, but report 0 | |
327 | * (to be ignored by client) if it's a error reply during recovery. | |
dadfcdab OD |
328 | * (bz15815) |
329 | */ | |
af3ec53b | 330 | if (req->rq_type == PTL_RPC_MSG_ERR && !req->rq_export) |
d7e09d03 PT |
331 | lustre_msg_set_timeout(req->rq_repmsg, 0); |
332 | else | |
333 | lustre_msg_set_timeout(req->rq_repmsg, | |
334 | at_get(&svcpt->scp_at_estimate)); | |
335 | ||
336 | if (req->rq_reqmsg && | |
337 | !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { | |
2e4fe2bd | 338 | CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x req_flags=%#x magic=%x/%x len=%d\n", |
d7e09d03 | 339 | flags, lustre_msg_get_flags(req->rq_reqmsg), |
d7e09d03 PT |
340 | lustre_msg_get_magic(req->rq_reqmsg), |
341 | lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); | |
342 | } | |
343 | } | |
344 | ||
345 | /** | |
346 | * Send request reply from request \a req reply buffer. | |
347 | * \a flags defines reply types | |
b6da17f3 | 348 | * Returns 0 on success or error code |
d7e09d03 PT |
349 | */ |
350 | int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) | |
351 | { | |
352 | struct ptlrpc_reply_state *rs = req->rq_reply_state; | |
d0bfef31 CH |
353 | struct ptlrpc_connection *conn; |
354 | int rc; | |
d7e09d03 PT |
355 | |
356 | /* We must already have a reply buffer (only ptlrpc_error() may be | |
357 | * called without one). The reply generated by sptlrpc layer (e.g. | |
358 | * error notify, etc.) might have NULL rq->reqmsg; Otherwise we must | |
359 | * have a request buffer which is either the actual (swabbed) incoming | |
360 | * request, or a saved copy if this is a req saved in | |
361 | * target_queue_final_reply(). | |
362 | */ | |
3949015e | 363 | LASSERT(req->rq_no_reply == 0); |
8b382089 OD |
364 | LASSERT(req->rq_reqbuf); |
365 | LASSERT(rs); | |
3949015e | 366 | LASSERT((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); |
8b382089 | 367 | LASSERT(req->rq_repmsg); |
3949015e KM |
368 | LASSERT(req->rq_repmsg == rs->rs_msg); |
369 | LASSERT(rs->rs_cb_id.cbid_fn == reply_out_callback); | |
370 | LASSERT(rs->rs_cb_id.cbid_arg == rs); | |
d7e09d03 PT |
371 | |
372 | /* There may be no rq_export during failover */ | |
373 | ||
374 | if (unlikely(req->rq_export && req->rq_export->exp_obd && | |
375 | req->rq_export->exp_obd->obd_fail)) { | |
376 | /* Failed obd's only send ENODEV */ | |
377 | req->rq_type = PTL_RPC_MSG_ERR; | |
378 | req->rq_status = -ENODEV; | |
379 | CDEBUG(D_HA, "sending ENODEV from failed obd %d\n", | |
380 | req->rq_export->exp_obd->obd_minor); | |
381 | } | |
382 | ||
dfc16973 | 383 | /* In order to keep interoperability with the client (< 2.3) which |
d7e09d03 PT |
384 | * doesn't have pb_jobid in ptlrpc_body, We have to shrink the |
385 | * ptlrpc_body in reply buffer to ptlrpc_body_v2, otherwise, the | |
386 | * reply buffer on client will be overflow. | |
387 | * | |
71474ccb JSO |
388 | * XXX Remove this whenever we drop the interoperability with |
389 | * such client. | |
d7e09d03 PT |
390 | */ |
391 | req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0, | |
392 | sizeof(struct ptlrpc_body_v2), 1); | |
393 | ||
394 | if (req->rq_type != PTL_RPC_MSG_ERR) | |
395 | req->rq_type = PTL_RPC_MSG_REPLY; | |
396 | ||
397 | lustre_msg_set_type(req->rq_repmsg, req->rq_type); | |
2d58de78 LW |
398 | lustre_msg_set_status(req->rq_repmsg, |
399 | ptlrpc_status_hton(req->rq_status)); | |
d7e09d03 | 400 | lustre_msg_set_opc(req->rq_repmsg, |
24c198e9 OD |
401 | req->rq_reqmsg ? |
402 | lustre_msg_get_opc(req->rq_reqmsg) : 0); | |
d7e09d03 PT |
403 | |
404 | target_pack_pool_reply(req); | |
405 | ||
406 | ptlrpc_at_set_reply(req, flags); | |
407 | ||
8b382089 | 408 | if (!req->rq_export || !req->rq_export->exp_connection) |
d7e09d03 PT |
409 | conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL); |
410 | else | |
411 | conn = ptlrpc_connection_addref(req->rq_export->exp_connection); | |
412 | ||
8b382089 | 413 | if (unlikely(!conn)) { |
d7e09d03 PT |
414 | CERROR("not replying on NULL connection\n"); /* bug 9635 */ |
415 | return -ENOTCONN; | |
416 | } | |
417 | ptlrpc_rs_addref(rs); /* +1 ref for the network */ | |
418 | ||
419 | rc = sptlrpc_svc_wrap_reply(req); | |
420 | if (unlikely(rc)) | |
421 | goto out; | |
422 | ||
219e6de6 | 423 | req->rq_sent = ktime_get_real_seconds(); |
d7e09d03 | 424 | |
3949015e KM |
425 | rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, |
426 | (rs->rs_difficult && !rs->rs_no_ack) ? | |
427 | LNET_ACK_REQ : LNET_NOACK_REQ, | |
428 | &rs->rs_cb_id, conn, | |
429 | ptlrpc_req2svc(req)->srv_rep_portal, | |
430 | req->rq_xid, req->rq_reply_off); | |
d7e09d03 PT |
431 | out: |
432 | if (unlikely(rc != 0)) | |
433 | ptlrpc_req_drop_rs(req); | |
434 | ptlrpc_connection_put(conn); | |
435 | return rc; | |
436 | } | |
437 | EXPORT_SYMBOL(ptlrpc_send_reply); | |
438 | ||
3949015e | 439 | int ptlrpc_reply(struct ptlrpc_request *req) |
d7e09d03 PT |
440 | { |
441 | if (req->rq_no_reply) | |
442 | return 0; | |
5ce91a9e | 443 | return ptlrpc_send_reply(req, 0); |
d7e09d03 PT |
444 | } |
445 | EXPORT_SYMBOL(ptlrpc_reply); | |
446 | ||
447 | /** | |
448 | * For request \a req send an error reply back. Create empty | |
449 | * reply buffers if necessary. | |
450 | */ | |
451 | int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) | |
452 | { | |
453 | int rc; | |
d7e09d03 PT |
454 | |
455 | if (req->rq_no_reply) | |
0a3bdb00 | 456 | return 0; |
d7e09d03 PT |
457 | |
458 | if (!req->rq_repmsg) { | |
459 | rc = lustre_pack_reply(req, 1, NULL, NULL); | |
460 | if (rc) | |
0a3bdb00 | 461 | return rc; |
d7e09d03 PT |
462 | } |
463 | ||
464 | if (req->rq_status != -ENOSPC && req->rq_status != -EACCES && | |
465 | req->rq_status != -EPERM && req->rq_status != -ENOENT && | |
466 | req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT) | |
467 | req->rq_type = PTL_RPC_MSG_ERR; | |
468 | ||
469 | rc = ptlrpc_send_reply(req, may_be_difficult); | |
0a3bdb00 | 470 | return rc; |
d7e09d03 PT |
471 | } |
472 | EXPORT_SYMBOL(ptlrpc_send_error); | |
473 | ||
/**
 * Send an error reply for \a req with no reply flags set.
 * Returns whatever ptlrpc_send_error() returns.
 */
int ptlrpc_error(struct ptlrpc_request *req)
{
	const int no_flags = 0;

	return ptlrpc_send_error(req, no_flags);
}
EXPORT_SYMBOL(ptlrpc_error);
479 | ||
480 | /** | |
481 | * Send request \a request. | |
482 | * if \a noreply is set, don't expect any reply back and don't set up | |
483 | * reply buffers. | |
484 | * Returns 0 on success or error code. | |
485 | */ | |
486 | int ptl_send_rpc(struct ptlrpc_request *request, int noreply) | |
487 | { | |
488 | int rc; | |
489 | int rc2; | |
490 | int mpflag = 0; | |
491 | struct ptlrpc_connection *connection; | |
d0bfef31 CH |
492 | lnet_handle_me_t reply_me_h; |
493 | lnet_md_t reply_md; | |
d7e09d03 | 494 | struct obd_device *obd = request->rq_import->imp_obd; |
d7e09d03 PT |
495 | |
496 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC)) | |
0a3bdb00 | 497 | return 0; |
d7e09d03 PT |
498 | |
499 | LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST); | |
500 | LASSERT(request->rq_wait_ctx == 0); | |
501 | ||
502 | /* If this is a re-transmit, we're required to have disengaged | |
dadfcdab OD |
503 | * cleanly from the previous attempt |
504 | */ | |
d7e09d03 | 505 | LASSERT(!request->rq_receiving_reply); |
5c689e68 | 506 | LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) && |
30c0aa39 | 507 | (request->rq_import->imp_state == LUSTRE_IMP_FULL))); |
d7e09d03 | 508 | |
8b382089 | 509 | if (unlikely(obd && obd->obd_fail)) { |
d7e09d03 | 510 | CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", |
30c0aa39 | 511 | obd->obd_name); |
d7e09d03 | 512 | /* this prevents us from waiting in ptlrpc_queue_wait */ |
15c50ccc | 513 | spin_lock(&request->rq_lock); |
d7e09d03 | 514 | request->rq_err = 1; |
15c50ccc | 515 | spin_unlock(&request->rq_lock); |
d7e09d03 | 516 | request->rq_status = -ENODEV; |
0a3bdb00 | 517 | return -ENODEV; |
d7e09d03 PT |
518 | } |
519 | ||
520 | connection = request->rq_import->imp_connection; | |
521 | ||
522 | lustre_msg_set_handle(request->rq_reqmsg, | |
523 | &request->rq_import->imp_remote_handle); | |
524 | lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); | |
525 | lustre_msg_set_conn_cnt(request->rq_reqmsg, | |
526 | request->rq_import->imp_conn_cnt); | |
527 | lustre_msghdr_set_flags(request->rq_reqmsg, | |
528 | request->rq_import->imp_msghdr_flags); | |
529 | ||
530 | if (request->rq_resend) | |
531 | lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); | |
532 | ||
533 | if (request->rq_memalloc) | |
534 | mpflag = cfs_memory_pressure_get_and_set(); | |
535 | ||
536 | rc = sptlrpc_cli_wrap_request(request); | |
537 | if (rc) | |
a9b3e8f3 | 538 | goto out; |
d7e09d03 PT |
539 | |
540 | /* bulk register should be done after wrap_request() */ | |
8b382089 | 541 | if (request->rq_bulk) { |
3949015e | 542 | rc = ptlrpc_register_bulk(request); |
d7e09d03 | 543 | if (rc != 0) |
a9b3e8f3 | 544 | goto out; |
d7e09d03 PT |
545 | } |
546 | ||
547 | if (!noreply) { | |
3949015e | 548 | LASSERT(request->rq_replen != 0); |
8b382089 OD |
549 | if (!request->rq_repbuf) { |
550 | LASSERT(!request->rq_repdata); | |
551 | LASSERT(!request->rq_repmsg); | |
d7e09d03 PT |
552 | rc = sptlrpc_cli_alloc_repbuf(request, |
553 | request->rq_replen); | |
554 | if (rc) { | |
555 | /* this prevents us from looping in | |
dadfcdab OD |
556 | * ptlrpc_queue_wait |
557 | */ | |
15c50ccc | 558 | spin_lock(&request->rq_lock); |
d7e09d03 | 559 | request->rq_err = 1; |
15c50ccc | 560 | spin_unlock(&request->rq_lock); |
d7e09d03 | 561 | request->rq_status = rc; |
a9b3e8f3 | 562 | goto cleanup_bulk; |
d7e09d03 PT |
563 | } |
564 | } else { | |
565 | request->rq_repdata = NULL; | |
566 | request->rq_repmsg = NULL; | |
567 | } | |
568 | ||
569 | rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ | |
570 | connection->c_peer, request->rq_xid, 0, | |
571 | LNET_UNLINK, LNET_INS_AFTER, &reply_me_h); | |
572 | if (rc != 0) { | |
573 | CERROR("LNetMEAttach failed: %d\n", rc); | |
3949015e | 574 | LASSERT(rc == -ENOMEM); |
a9b3e8f3 JL |
575 | rc = -ENOMEM; |
576 | goto cleanup_bulk; | |
d7e09d03 PT |
577 | } |
578 | } | |
579 | ||
580 | spin_lock(&request->rq_lock); | |
d7e09d03 | 581 | /* We are responsible for unlinking the reply buffer */ |
9faa2ade LZ |
582 | request->rq_reply_unlinked = noreply; |
583 | request->rq_receiving_reply = !noreply; | |
d7e09d03 | 584 | /* Clear any flags that may be present from previous sends. */ |
9faa2ade | 585 | request->rq_req_unlinked = 0; |
d7e09d03 PT |
586 | request->rq_replied = 0; |
587 | request->rq_err = 0; | |
588 | request->rq_timedout = 0; | |
589 | request->rq_net_err = 0; | |
590 | request->rq_resend = 0; | |
591 | request->rq_restart = 0; | |
9faa2ade | 592 | request->rq_reply_truncated = 0; |
d7e09d03 PT |
593 | spin_unlock(&request->rq_lock); |
594 | ||
595 | if (!noreply) { | |
d0bfef31 CH |
596 | reply_md.start = request->rq_repbuf; |
597 | reply_md.length = request->rq_repbuf_len; | |
d7e09d03 PT |
598 | /* Allow multiple early replies */ |
599 | reply_md.threshold = LNET_MD_THRESH_INF; | |
600 | /* Manage remote for early replies */ | |
d0bfef31 | 601 | reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | |
d7e09d03 | 602 | LNET_MD_MANAGE_REMOTE | |
7fb7027c | 603 | LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */ |
d0bfef31 | 604 | reply_md.user_ptr = &request->rq_reply_cbid; |
d7e09d03 PT |
605 | reply_md.eq_handle = ptlrpc_eq_h; |
606 | ||
9faa2ade | 607 | /* We must see the unlink callback to set rq_reply_unlinked, |
dadfcdab OD |
608 | * so we can't auto-unlink |
609 | */ | |
d7e09d03 PT |
610 | rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN, |
611 | &request->rq_reply_md_h); | |
612 | if (rc != 0) { | |
613 | CERROR("LNetMDAttach failed: %d\n", rc); | |
3949015e | 614 | LASSERT(rc == -ENOMEM); |
d7e09d03 PT |
615 | spin_lock(&request->rq_lock); |
616 | /* ...but the MD attach didn't succeed... */ | |
617 | request->rq_receiving_reply = 0; | |
618 | spin_unlock(&request->rq_lock); | |
a9b3e8f3 JL |
619 | rc = -ENOMEM; |
620 | goto cleanup_me; | |
d7e09d03 PT |
621 | } |
622 | ||
b0f5aad5 | 623 | CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid %llu, portal %u\n", |
d7e09d03 PT |
624 | request->rq_repbuf_len, request->rq_xid, |
625 | request->rq_reply_portal); | |
626 | } | |
627 | ||
628 | /* add references on request for request_out_callback */ | |
629 | ptlrpc_request_addref(request); | |
8b382089 | 630 | if (obd && obd->obd_svc_stats) |
d7e09d03 PT |
631 | lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, |
632 | atomic_read(&request->rq_import->imp_inflight)); | |
633 | ||
634 | OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); | |
635 | ||
32c8728d | 636 | ktime_get_real_ts64(&request->rq_sent_tv); |
219e6de6 | 637 | request->rq_sent = ktime_get_real_seconds(); |
d7e09d03 | 638 | /* We give the server rq_timeout secs to process the req, and |
dadfcdab OD |
639 | * add the network latency for our local timeout. |
640 | */ | |
d7e09d03 PT |
641 | request->rq_deadline = request->rq_sent + request->rq_timeout + |
642 | ptlrpc_at_get_net_latency(request); | |
643 | ||
644 | ptlrpc_pinger_sending_on_import(request->rq_import); | |
645 | ||
646 | DEBUG_REQ(D_INFO, request, "send flg=%x", | |
647 | lustre_msg_get_flags(request->rq_reqmsg)); | |
648 | rc = ptl_send_buf(&request->rq_req_md_h, | |
649 | request->rq_reqbuf, request->rq_reqdata_len, | |
650 | LNET_NOACK_REQ, &request->rq_req_cbid, | |
651 | connection, | |
652 | request->rq_request_portal, | |
653 | request->rq_xid, 0); | |
9faa2ade | 654 | if (likely(rc == 0)) |
a9b3e8f3 | 655 | goto out; |
d7e09d03 | 656 | |
9faa2ade | 657 | request->rq_req_unlinked = 1; |
d7e09d03 PT |
658 | ptlrpc_req_finished(request); |
659 | if (noreply) | |
a9b3e8f3 | 660 | goto out; |
d7e09d03 PT |
661 | |
662 | cleanup_me: | |
663 | /* MEUnlink is safe; the PUT didn't even get off the ground, and | |
664 | * nobody apart from the PUT's target has the right nid+XID to | |
dadfcdab OD |
665 | * access the reply buffer. |
666 | */ | |
d7e09d03 | 667 | rc2 = LNetMEUnlink(reply_me_h); |
3949015e | 668 | LASSERT(rc2 == 0); |
d7e09d03 PT |
669 | /* UNLINKED callback called synchronously */ |
670 | LASSERT(!request->rq_receiving_reply); | |
671 | ||
672 | cleanup_bulk: | |
673 | /* We do sync unlink here as there was no real transfer here so | |
dadfcdab OD |
674 | * the chance to have long unlink to sluggish net is smaller here. |
675 | */ | |
d7e09d03 PT |
676 | ptlrpc_unregister_bulk(request, 0); |
677 | out: | |
678 | if (request->rq_memalloc) | |
679 | cfs_memory_pressure_restore(mpflag); | |
680 | return rc; | |
681 | } | |
682 | EXPORT_SYMBOL(ptl_send_rpc); | |
683 | ||
684 | /** | |
685 | * Register request buffer descriptor for request receiving. | |
686 | */ | |
687 | int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) | |
688 | { | |
d0bfef31 CH |
689 | struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; |
690 | static lnet_process_id_t match_id = {LNET_NID_ANY, LNET_PID_ANY}; | |
691 | int rc; | |
692 | lnet_md_t md; | |
693 | lnet_handle_me_t me_h; | |
d7e09d03 PT |
694 | |
695 | CDEBUG(D_NET, "LNetMEAttach: portal %d\n", | |
696 | service->srv_req_portal); | |
697 | ||
698 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD)) | |
fbe7c6c7 | 699 | return -ENOMEM; |
d7e09d03 PT |
700 | |
701 | /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL, | |
702 | * which means buffer can only be attached on local CPT, and LND | |
dadfcdab OD |
703 | * threads can find it by grabbing a local lock |
704 | */ | |
d7e09d03 PT |
705 | rc = LNetMEAttach(service->srv_req_portal, |
706 | match_id, 0, ~0, LNET_UNLINK, | |
707 | rqbd->rqbd_svcpt->scp_cpt >= 0 ? | |
708 | LNET_INS_LOCAL : LNET_INS_AFTER, &me_h); | |
709 | if (rc != 0) { | |
710 | CERROR("LNetMEAttach failed: %d\n", rc); | |
fbe7c6c7 | 711 | return -ENOMEM; |
d7e09d03 PT |
712 | } |
713 | ||
714 | LASSERT(rqbd->rqbd_refcount == 0); | |
715 | rqbd->rqbd_refcount = 1; | |
716 | ||
d0bfef31 CH |
717 | md.start = rqbd->rqbd_buffer; |
718 | md.length = service->srv_buf_size; | |
719 | md.max_size = service->srv_max_req_size; | |
d7e09d03 | 720 | md.threshold = LNET_MD_THRESH_INF; |
d0bfef31 CH |
721 | md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE; |
722 | md.user_ptr = &rqbd->rqbd_cbid; | |
d7e09d03 PT |
723 | md.eq_handle = ptlrpc_eq_h; |
724 | ||
725 | rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h); | |
726 | if (rc == 0) | |
fbe7c6c7 | 727 | return 0; |
d7e09d03 | 728 | |
998d2766 | 729 | CERROR("LNetMDAttach failed: %d;\n", rc); |
3949015e KM |
730 | LASSERT(rc == -ENOMEM); |
731 | rc = LNetMEUnlink(me_h); | |
732 | LASSERT(rc == 0); | |
d7e09d03 PT |
733 | rqbd->rqbd_refcount = 0; |
734 | ||
fbe7c6c7 | 735 | return -ENOMEM; |
d7e09d03 | 736 | } |