/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */

/** Implementation of client-side PortalRPC interfaces */

#define DEBUG_SUBSYSTEM S_RPC

#include <obd_support.h>
#include <obd_class.h>
#include <lustre_lib.h>
#include <lustre_ha.h>
#include <lustre_import.h>
#include <lustre_req_layout.h>

#include "ptlrpc_internal.h"

static int ptlrpc_send_new_req(struct ptlrpc_request *req);

/**
 * Initialize the passed-in client structure \a cl.
 */
void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
                        struct ptlrpc_client *cl)
{
        cl->cli_request_portal = req_portal;
        cl->cli_reply_portal   = rep_portal;
        cl->cli_name           = name;
}
EXPORT_SYMBOL(ptlrpc_init_client);
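
/*
 * Usage sketch (illustrative, not part of the original file): a client
 * module typically fills in its ptlrpc_client once at setup time.  The
 * portal numbers and name below are hypothetical placeholders.
 *
 *      static struct ptlrpc_client example_cli;
 *
 *      ptlrpc_init_client(EXAMPLE_REQUEST_PORTAL, EXAMPLE_REPLY_PORTAL,
 *                         "example-client", &example_cli);
 */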

/**
 * Return the PortalRPC connection for remote uuid \a uuid.
 */
struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
{
        struct ptlrpc_connection *c;
        lnet_nid_t self;
        lnet_process_id_t peer;
        int err;

        /* ptlrpc_uuid_to_peer() initializes its 2nd parameter
         * before accessing its values. */
        /* coverity[uninit_use_in_call] */
        err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
        if (err != 0) {
                CNETERR("cannot find peer %s!\n", uuid->uuid);
                return NULL;
        }

        c = ptlrpc_connection_get(peer, self, uuid);
        if (c) {
                memcpy(c->c_remote_uuid.uuid,
                       uuid->uuid, sizeof(c->c_remote_uuid.uuid));
        }

        CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c);

        return c;
}
EXPORT_SYMBOL(ptlrpc_uuid_to_connection);

/**
 * Allocate and initialize new bulk descriptor on the sender.
 * Returns pointer to the descriptor or NULL on error.
 */
struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
                                         unsigned type, unsigned portal)
{
        struct ptlrpc_bulk_desc *desc;
        int i;

        OBD_ALLOC(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[npages]));
        if (!desc)
                return NULL;

        spin_lock_init(&desc->bd_lock);
        init_waitqueue_head(&desc->bd_waitq);
        desc->bd_max_iov = npages;
        desc->bd_iov_count = 0;
        desc->bd_portal = portal;
        desc->bd_type = type;
        desc->bd_md_count = 0;
        LASSERT(max_brw > 0);
        desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
        /* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
         * node. Negotiated ocd_brw_size will always be <= this number. */
        for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
                LNetInvalidateHandle(&desc->bd_mds[i]);

        return desc;
}

/**
 * Prepare bulk descriptor for specified outgoing request \a req that
 * can fit \a npages pages. \a type is the bulk type. \a portal is where
 * the bulk is to be sent. Used on the client side.
 * Returns pointer to newly allocated initialized bulk descriptor or NULL on
 * error.
 */
struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
                                              unsigned npages, unsigned max_brw,
                                              unsigned type, unsigned portal)
{
        struct obd_import *imp = req->rq_import;
        struct ptlrpc_bulk_desc *desc;

        LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
        desc = ptlrpc_new_bulk(npages, max_brw, type, portal);
        if (desc == NULL)
                RETURN(NULL);

        desc->bd_import_generation = req->rq_import_generation;
        desc->bd_import = class_import_get(imp);
        desc->bd_req = req;

        desc->bd_cbid.cbid_fn  = client_bulk_callback;
        desc->bd_cbid.cbid_arg = desc;

        /* This makes the req own the desc, and free it when the req itself
         * is freed */
        req->rq_bulk = desc;

        return desc;
}
EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
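
/*
 * Usage sketch (illustrative): a client I/O path prepares a bulk descriptor
 * for an outgoing request and attaches the pages to transfer.  The request,
 * page array, count and portal constant here are hypothetical.
 *
 *      desc = ptlrpc_prep_bulk_imp(req, npages, 1, BULK_PUT_SINK,
 *                                  OST_BULK_PORTAL);
 *      if (desc == NULL)
 *              return -ENOMEM;
 *      for (i = 0; i < npages; i++)
 *              __ptlrpc_prep_bulk_page(desc, pages[i], 0,
 *                                      PAGE_CACHE_SIZE, 1);
 */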

/**
 * Add a page \a page to the bulk descriptor \a desc.
 * Data to transfer in the page starts at offset \a pageoffset and
 * the amount of data to transfer from the page is \a len.
 */
void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
                             struct page *page, int pageoffset, int len, int pin)
{
        LASSERT(desc->bd_iov_count < desc->bd_max_iov);
        LASSERT(page != NULL);
        LASSERT(pageoffset >= 0);
        LASSERT(len > 0);
        LASSERT(pageoffset + len <= PAGE_CACHE_SIZE);

        desc->bd_nob += len;

        if (pin)
                page_cache_get(page);

        ptlrpc_add_bulk_page(desc, page, pageoffset, len);
}
EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);

/**
 * Uninitialize and free bulk descriptor \a desc.
 * Works on bulk descriptors both from server and client side.
 */
void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin)
{
        int i;

        LASSERT(desc != NULL);
        LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
        LASSERT(desc->bd_md_count == 0);          /* network hands off */
        LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));

        sptlrpc_enc_pool_put_pages(desc);

        if (desc->bd_export)
                class_export_put(desc->bd_export);
        else
                class_import_put(desc->bd_import);

        if (unpin) {
                for (i = 0; i < desc->bd_iov_count; i++)
                        page_cache_release(desc->bd_iov[i].kiov_page);
        }

        OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc,
                                bd_iov[desc->bd_max_iov]));
}
EXPORT_SYMBOL(__ptlrpc_free_bulk);

/**
 * Set the server time limit for this req, i.e. how long we are willing
 * to wait for a reply before timing out this request.
 */
void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
{
        __u32 serv_est;
        int idx;
        struct imp_at *at;

        LASSERT(req->rq_import);

        if (AT_OFF) {
                /* non-AT settings */
                /**
                 * \a imp_server_timeout means this is a reverse import and
                 * we send (currently only) ASTs to the client and cannot
                 * afford to wait too long for the reply, otherwise the other
                 * client (because of which we are sending this request) would
                 * time out waiting for us
                 */
                req->rq_timeout = req->rq_import->imp_server_timeout ?
                                  obd_timeout / 2 : obd_timeout;
        } else {
                at = &req->rq_import->imp_at;
                idx = import_at_get_index(req->rq_import,
                                          req->rq_request_portal);
                serv_est = at_get(&at->iat_service_estimate[idx]);
                req->rq_timeout = at_est2timeout(serv_est);
        }
        /* We could get even fancier here, using history to predict increased
           loading... */

        /* Let the server know what this RPC timeout is by putting it in the
           reqmsg */
        lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
}
EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
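
/*
 * Worked example (illustrative): with adaptive timeouts enabled, a tracked
 * service estimate of 20s for the target portal gives at_est2timeout() a
 * value of 20 x 125% + 5 = 30s (per the resend comment in after_reply()
 * below), so rq_timeout becomes 30 and is advertised to the server in the
 * request message.
 */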

/* Adjust max service estimate based on server value */
static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
                                  unsigned int serv_est)
{
        int idx;
        unsigned int oldse;
        struct imp_at *at;

        LASSERT(req->rq_import);
        at = &req->rq_import->imp_at;

        idx = import_at_get_index(req->rq_import, req->rq_request_portal);
        /* max service estimates are tracked on the server side,
           so just keep minimal history here */
        oldse = at_measured(&at->iat_service_estimate[idx], serv_est);
        if (oldse != 0)
                CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d "
                       "has changed from %d to %d\n",
                       req->rq_import->imp_obd->obd_name, req->rq_request_portal,
                       oldse, at_get(&at->iat_service_estimate[idx]));
}

/* Expected network latency per remote node (secs) */
int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
{
        return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
}

/* Adjust expected network latency */
static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
                                      unsigned int service_time)
{
        unsigned int nl, oldnl;
        struct imp_at *at;
        time_t now = cfs_time_current_sec();

        LASSERT(req->rq_import);
        at = &req->rq_import->imp_at;

        /* Network latency is total time less server processing time */
        nl = max_t(int, now - req->rq_sent - service_time, 0) + 1 /* st rounding */;
        if (service_time > now - req->rq_sent + 3 /* bz16408 */)
                CWARN("Reported service time %u > total measured time "
                      CFS_DURATION_T"\n", service_time,
                      cfs_time_sub(now, req->rq_sent));

        oldnl = at_measured(&at->iat_net_latency, nl);
        if (oldnl != 0)
                CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) "
                       "has changed from %d to %d\n",
                       req->rq_import->imp_obd->obd_name,
                       obd_uuid2str(
                               &req->rq_import->imp_connection->c_remote_uuid),
                       oldnl, at_get(&at->iat_net_latency));
}

static int unpack_reply(struct ptlrpc_request *req)
{
        int rc;

        if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
                rc = ptlrpc_unpack_rep_msg(req, req->rq_replen);
                if (rc) {
                        DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
                        return -EPROTO;
                }
        }

        rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
        if (rc) {
                DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
                return -EPROTO;
        }
        return 0;
}

/**
 * Handle an early reply message, called with the rq_lock held.
 * If anything goes wrong just ignore it - same as if it never happened
 */
static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
{
        struct ptlrpc_request *early_req;
        time_t olddl;
        int rc;

        req->rq_early = 0;
        spin_unlock(&req->rq_lock);

        rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
        if (rc) {
                spin_lock(&req->rq_lock);
                RETURN(rc);
        }

        rc = unpack_reply(early_req);
        if (rc == 0) {
                /* Expecting to increase the service time estimate here */
                ptlrpc_at_adj_service(req,
                        lustre_msg_get_timeout(early_req->rq_repmsg));
                ptlrpc_at_adj_net_latency(req,
                        lustre_msg_get_service_time(early_req->rq_repmsg));
        }

        sptlrpc_cli_finish_early_reply(early_req);

        if (rc != 0) {
                spin_lock(&req->rq_lock);
                RETURN(rc);
        }

        /* Adjust the local timeout for this req */
        ptlrpc_at_set_req_timeout(req);

        spin_lock(&req->rq_lock);
        olddl = req->rq_deadline;
        /* server assumes it now has rq_timeout from when it sent the
         * early reply, so client should give it at least that long. */
        req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
                           ptlrpc_at_get_net_latency(req);

        DEBUG_REQ(D_ADAPTTO, req,
                  "Early reply #%d, new deadline in "CFS_DURATION_T"s "
                  "("CFS_DURATION_T"s)", req->rq_early_count,
                  cfs_time_sub(req->rq_deadline, cfs_time_current_sec()),
                  cfs_time_sub(req->rq_deadline, olddl));

        RETURN(rc);
}

/**
 * Wind down request pool \a pool.
 * Frees all requests from the pool too.
 */
void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
{
        struct list_head *l, *tmp;
        struct ptlrpc_request *req;

        LASSERT(pool != NULL);

        spin_lock(&pool->prp_lock);
        list_for_each_safe(l, tmp, &pool->prp_req_list) {
                req = list_entry(l, struct ptlrpc_request, rq_list);
                list_del(&req->rq_list);
                LASSERT(req->rq_reqbuf);
                LASSERT(req->rq_reqbuf_len == pool->prp_rq_size);
                OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size);
                OBD_FREE(req, sizeof(*req));
        }
        spin_unlock(&pool->prp_lock);
        OBD_FREE(pool, sizeof(*pool));
}
EXPORT_SYMBOL(ptlrpc_free_rq_pool);

/**
 * Allocates, initializes and adds \a num_rq requests to the pool \a pool.
 */
void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
{
        int i;
        int size = 1;

        while (size < pool->prp_rq_size)
                size <<= 1;

        LASSERTF(list_empty(&pool->prp_req_list) ||
                 size == pool->prp_rq_size,
                 "Trying to change pool size with nonempty pool "
                 "from %d to %d bytes\n", pool->prp_rq_size, size);

        spin_lock(&pool->prp_lock);
        pool->prp_rq_size = size;
        for (i = 0; i < num_rq; i++) {
                struct ptlrpc_request *req;
                struct lustre_msg *msg;

                spin_unlock(&pool->prp_lock);
                OBD_ALLOC(req, sizeof(struct ptlrpc_request));
                if (!req)
                        return;
                OBD_ALLOC_LARGE(msg, size);
                if (!msg) {
                        OBD_FREE(req, sizeof(struct ptlrpc_request));
                        return;
                }
                req->rq_reqbuf = msg;
                req->rq_reqbuf_len = size;
                req->rq_pool = pool;
                spin_lock(&pool->prp_lock);
                list_add_tail(&req->rq_list, &pool->prp_req_list);
        }
        spin_unlock(&pool->prp_lock);
        return;
}
EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);

/**
 * Create and initialize a new request pool with given attributes:
 * \a num_rq - initial number of requests to create for the pool
 * \a msgsize - maximum message size possible for requests in this pool
 * \a populate_pool - function to be called when more requests need to be added
 *                    to the pool
 * Returns pointer to newly created pool or NULL on error.
 */
struct ptlrpc_request_pool *
ptlrpc_init_rq_pool(int num_rq, int msgsize,
                    void (*populate_pool)(struct ptlrpc_request_pool *, int))
{
        struct ptlrpc_request_pool *pool;

        OBD_ALLOC(pool, sizeof(struct ptlrpc_request_pool));
        if (!pool)
                return NULL;

        /* Request next power of two for the allocation, because internally
           the kernel would do exactly this */

        spin_lock_init(&pool->prp_lock);
        INIT_LIST_HEAD(&pool->prp_req_list);
        pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD;
        pool->prp_populate = populate_pool;

        populate_pool(pool, num_rq);

        if (list_empty(&pool->prp_req_list)) {
                /* have not allocated a single request for the pool */
                OBD_FREE(pool, sizeof(struct ptlrpc_request_pool));
                pool = NULL;
        }
        return pool;
}
EXPORT_SYMBOL(ptlrpc_init_rq_pool);
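
/*
 * Usage sketch (illustrative): callers size the pool for their largest
 * request message and pass ptlrpc_add_rqs_to_pool() as the populate
 * callback so the pool can be refilled later.  The count and size below
 * are made up.
 *
 *      struct ptlrpc_request_pool *pool;
 *
 *      pool = ptlrpc_init_rq_pool(4, 16384, ptlrpc_add_rqs_to_pool);
 *      if (pool == NULL)
 *              return -ENOMEM;
 */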

/**
 * Fetches one request from pool \a pool.
 */
static struct ptlrpc_request *
ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool)
{
        struct ptlrpc_request *request;
        struct lustre_msg *reqbuf;

        if (!pool)
                return NULL;

        spin_lock(&pool->prp_lock);

        /* See if we have anything in a pool, and bail out if nothing.
         * In the writeout path, where this matters, this is safe to do
         * because nothing is lost in this case, and when some in-flight
         * requests complete, this code will be called again. */
        if (unlikely(list_empty(&pool->prp_req_list))) {
                spin_unlock(&pool->prp_lock);
                return NULL;
        }

        request = list_entry(pool->prp_req_list.next, struct ptlrpc_request,
                             rq_list);
        list_del_init(&request->rq_list);
        spin_unlock(&pool->prp_lock);

        LASSERT(request->rq_reqbuf);
        LASSERT(request->rq_pool);

        reqbuf = request->rq_reqbuf;
        memset(request, 0, sizeof(*request));
        request->rq_reqbuf = reqbuf;
        request->rq_reqbuf_len = pool->prp_rq_size;
        request->rq_pool = pool;

        return request;
}

/**
 * Returns freed \a request back to the pool.
 */
static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
{
        struct ptlrpc_request_pool *pool = request->rq_pool;

        spin_lock(&pool->prp_lock);
        LASSERT(list_empty(&request->rq_list));
        LASSERT(!request->rq_receiving_reply);
        list_add_tail(&request->rq_list, &pool->prp_req_list);
        spin_unlock(&pool->prp_lock);
}

static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
                                      __u32 version, int opcode,
                                      int count, __u32 *lengths, char **bufs,
                                      struct ptlrpc_cli_ctx *ctx)
{
        struct obd_import *imp = request->rq_import;
        int rc;

        if (unlikely(ctx))
                request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx);
        else {
                rc = sptlrpc_req_get_ctx(request);
                if (rc)
                        GOTO(out_free, rc);
        }

        sptlrpc_req_set_flavor(request, opcode);

        rc = lustre_pack_request(request, imp->imp_msg_magic, count,
                                 lengths, bufs);
        if (rc) {
                LASSERT(!request->rq_pool);
                GOTO(out_ctx, rc);
        }

        lustre_msg_add_version(request->rq_reqmsg, version);
        request->rq_send_state = LUSTRE_IMP_FULL;
        request->rq_type = PTL_RPC_MSG_REQUEST;
        request->rq_export = NULL;

        request->rq_req_cbid.cbid_fn  = request_out_callback;
        request->rq_req_cbid.cbid_arg = request;

        request->rq_reply_cbid.cbid_fn  = reply_in_callback;
        request->rq_reply_cbid.cbid_arg = request;

        request->rq_reply_deadline = 0;
        request->rq_phase = RQ_PHASE_NEW;
        request->rq_next_phase = RQ_PHASE_UNDEFINED;

        request->rq_request_portal = imp->imp_client->cli_request_portal;
        request->rq_reply_portal = imp->imp_client->cli_reply_portal;

        ptlrpc_at_set_req_timeout(request);

        spin_lock_init(&request->rq_lock);
        INIT_LIST_HEAD(&request->rq_list);
        INIT_LIST_HEAD(&request->rq_timed_list);
        INIT_LIST_HEAD(&request->rq_replay_list);
        INIT_LIST_HEAD(&request->rq_ctx_chain);
        INIT_LIST_HEAD(&request->rq_set_chain);
        INIT_LIST_HEAD(&request->rq_history_list);
        INIT_LIST_HEAD(&request->rq_exp_list);
        init_waitqueue_head(&request->rq_reply_waitq);
        init_waitqueue_head(&request->rq_set_waitq);
        request->rq_xid = ptlrpc_next_xid();
        atomic_set(&request->rq_refcount, 1);

        lustre_msg_set_opc(request->rq_reqmsg, opcode);

        RETURN(0);
out_ctx:
        sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1);
out_free:
        class_import_put(imp);
        return rc;
}

int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
                             __u32 version, int opcode, char **bufs,
                             struct ptlrpc_cli_ctx *ctx)
{
        int count;

        count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT);
        return __ptlrpc_request_bufs_pack(request, version, opcode, count,
                                          request->rq_pill.rc_area[RCL_CLIENT],
                                          bufs, ctx);
}
EXPORT_SYMBOL(ptlrpc_request_bufs_pack);

/**
 * Pack request buffers for network transfer, performing any necessary
 * encryption steps.
 */
int ptlrpc_request_pack(struct ptlrpc_request *request,
                        __u32 version, int opcode)
{
        int rc;

        rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
        if (rc)
                return rc;

        /* For some old 1.8 clients (< 1.8.7), they will LASSERT the size of
         * ptlrpc_body sent from server equal to local ptlrpc_body size, so we
         * have to send old ptlrpc_body to keep interoperability with these
         * clients.
         *
         * Only three kinds of server->client RPCs so far:
         *  - LDLM_BL_CALLBACK
         *  - LDLM_CP_CALLBACK
         *  - LDLM_GL_CALLBACK
         *
         * XXX This should be removed whenever we drop interoperability with
         * these old clients.
         */
        if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK ||
            opcode == LDLM_GL_CALLBACK)
                req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY,
                                   sizeof(struct ptlrpc_body_v2), RCL_CLIENT);

        return rc;
}
EXPORT_SYMBOL(ptlrpc_request_pack);

/**
 * Helper function to allocate a new request on import \a imp,
 * possibly reusing an existing request from pool \a pool if provided.
 * Returns allocated request structure with import field filled or
 * NULL on error.
 */
static inline
struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
                                              struct ptlrpc_request_pool *pool)
{
        struct ptlrpc_request *request = NULL;

        if (pool)
                request = ptlrpc_prep_req_from_pool(pool);

        if (!request)
                OBD_ALLOC_PTR(request);

        if (request) {
                LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
                LASSERT(imp != LP_POISON);
                LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p",
                         imp->imp_client);
                LASSERT(imp->imp_client != LP_POISON);

                request->rq_import = class_import_get(imp);
        } else {
                CERROR("request allocation out of memory\n");
        }

        return request;
}

/**
 * Helper function for creating a request.
 * Calls __ptlrpc_request_alloc to allocate a new request structure and inits
 * buffer structures according to capsule template \a format.
 * Returns allocated request structure pointer or NULL on error.
 */
static struct ptlrpc_request *
ptlrpc_request_alloc_internal(struct obd_import *imp,
                              struct ptlrpc_request_pool *pool,
                              const struct req_format *format)
{
        struct ptlrpc_request *request;

        request = __ptlrpc_request_alloc(imp, pool);
        if (request == NULL)
                return NULL;

        req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
        req_capsule_set(&request->rq_pill, format);
        return request;
}

/**
 * Allocate new request structure for import \a imp and initialize its
 * buffer structure according to capsule template \a format.
 */
struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
                                            const struct req_format *format)
{
        return ptlrpc_request_alloc_internal(imp, NULL, format);
}
EXPORT_SYMBOL(ptlrpc_request_alloc);

/**
 * Allocate new request structure for import \a imp from pool \a pool and
 * initialize its buffer structure according to capsule template \a format.
 */
struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
                                                 struct ptlrpc_request_pool *pool,
                                                 const struct req_format *format)
{
        return ptlrpc_request_alloc_internal(imp, pool, format);
}
EXPORT_SYMBOL(ptlrpc_request_alloc_pool);

/**
 * For requests not from a pool, free the memory of the request structure.
 * For requests obtained from a pool earlier, return the request back to
 * the pool.
 */
void ptlrpc_request_free(struct ptlrpc_request *request)
{
        if (request->rq_pool)
                __ptlrpc_free_req_to_pool(request);
        else
                OBD_FREE_PTR(request);
}
EXPORT_SYMBOL(ptlrpc_request_free);

/**
 * Allocate a new request for operation \a opcode and immediately pack it for
 * network transfer.
 * Only used for simple requests like OBD_PING where the only important
 * part of the request is the operation itself.
 * Returns allocated request or NULL on error.
 */
struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
                                                 const struct req_format *format,
                                                 __u32 version, int opcode)
{
        struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format);
        int rc;

        if (req) {
                rc = ptlrpc_request_pack(req, version, opcode);
                if (rc) {
                        ptlrpc_request_free(req);
                        req = NULL;
                }
        }
        return req;
}
EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
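
/*
 * Usage sketch (illustrative): the pinger is the canonical caller, where
 * allocation and packing collapse into one call.  RQF_OBD_PING and
 * LUSTRE_OBD_VERSION are the layout/version constants used for OBD_PING
 * elsewhere in ptlrpc.
 *
 *      req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
 *                                      LUSTRE_OBD_VERSION, OBD_PING);
 *      if (req == NULL)
 *              return -ENOMEM;
 */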

/**
 * Prepare a request (fetched from pool \a pool if not NULL) on import \a imp
 * for operation \a opcode. The request will contain \a count buffers.
 * Sizes of buffers are described in array \a lengths and buffers themselves
 * are provided by a pointer \a bufs.
 * Returns prepared request structure pointer or NULL on error.
 */
struct ptlrpc_request *
ptlrpc_prep_req_pool(struct obd_import *imp,
                     __u32 version, int opcode,
                     int count, __u32 *lengths, char **bufs,
                     struct ptlrpc_request_pool *pool)
{
        struct ptlrpc_request *request;
        int rc;

        request = __ptlrpc_request_alloc(imp, pool);
        if (!request)
                return NULL;

        rc = __ptlrpc_request_bufs_pack(request, version, opcode, count,
                                        lengths, bufs, NULL);
        if (rc) {
                ptlrpc_request_free(request);
                request = NULL;
        }
        return request;
}
EXPORT_SYMBOL(ptlrpc_prep_req_pool);

/**
 * Same as ptlrpc_prep_req_pool, but without pool.
 */
struct ptlrpc_request *
ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count,
                __u32 *lengths, char **bufs)
{
        return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs,
                                    NULL);
}
EXPORT_SYMBOL(ptlrpc_prep_req);

/**
 * Allocate and initialize new request set structure.
 * Returns a pointer to the newly allocated set structure or NULL on error.
 */
struct ptlrpc_request_set *ptlrpc_prep_set(void)
{
        struct ptlrpc_request_set *set;

        OBD_ALLOC(set, sizeof(*set));
        if (!set)
                RETURN(NULL);
        atomic_set(&set->set_refcount, 1);
        INIT_LIST_HEAD(&set->set_requests);
        init_waitqueue_head(&set->set_waitq);
        atomic_set(&set->set_new_count, 0);
        atomic_set(&set->set_remaining, 0);
        spin_lock_init(&set->set_new_req_lock);
        INIT_LIST_HEAD(&set->set_new_requests);
        INIT_LIST_HEAD(&set->set_cblist);
        set->set_max_inflight = UINT_MAX;
        set->set_producer = NULL;
        set->set_producer_arg = NULL;
        set->set_rc = 0;

        RETURN(set);
}
EXPORT_SYMBOL(ptlrpc_prep_set);
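
/*
 * Usage sketch (illustrative): the typical synchronous multi-RPC pattern
 * builds a set, adds prepared requests, waits for completion, then destroys
 * the set.  ptlrpc_set_wait() is not part of this excerpt; error handling
 * is elided.
 *
 *      set = ptlrpc_prep_set();
 *      if (set == NULL)
 *              return -ENOMEM;
 *      ptlrpc_set_add_req(set, req);
 *      rc = ptlrpc_set_wait(set);
 *      ptlrpc_set_destroy(set);
 */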

/**
 * Allocate and initialize new request set structure with flow control
 * extension. This extension allows controlling the number of requests
 * in-flight for the whole set. A callback function to generate requests
 * must be provided and the request set will keep the number of requests
 * sent over the wire to \a max_inflight.
 * Returns a pointer to the newly allocated set structure or NULL on error.
 */
struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
                                             void *arg)
{
        struct ptlrpc_request_set *set;

        set = ptlrpc_prep_set();
        if (!set)
                RETURN(NULL);

        set->set_max_inflight  = max;
        set->set_producer      = func;
        set->set_producer_arg  = arg;

        RETURN(set);
}
EXPORT_SYMBOL(ptlrpc_prep_fcset);
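
/*
 * Usage sketch (illustrative): a producer callback queues one new request
 * on the set via ptlrpc_set_add_req() and returns 0, or returns -ENOENT
 * once it has nothing left to produce (see ptlrpc_set_producer() below).
 * example_next_req() is a hypothetical helper.
 *
 *      static int example_producer(struct ptlrpc_request_set *set, void *arg)
 *      {
 *              struct ptlrpc_request *req = example_next_req(arg);
 *
 *              if (req == NULL)
 *                      return -ENOENT;
 *              ptlrpc_set_add_req(set, req);
 *              return 0;
 *      }
 *
 *      set = ptlrpc_prep_fcset(8, example_producer, arg);
 */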

/**
 * Wind down and free request set structure previously allocated with
 * ptlrpc_prep_set.
 * Ensures that all requests on the set have completed and removes
 * all requests from the request list in a set.
 * If any unsent requests happen to be on the list, pretends that they got
 * an error in flight and calls their completion handler.
 */
void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
{
        struct list_head *tmp;
        struct list_head *next;
        int expected_phase;
        int n = 0;

        /* Requests on the set should either all be completed, or all be new */
        expected_phase = (atomic_read(&set->set_remaining) == 0) ?
                         RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
        list_for_each(tmp, &set->set_requests) {
                struct ptlrpc_request *req =
                        list_entry(tmp, struct ptlrpc_request,
                                   rq_set_chain);

                LASSERT(req->rq_phase == expected_phase);
                n++;
        }

        LASSERTF(atomic_read(&set->set_remaining) == 0 ||
                 atomic_read(&set->set_remaining) == n, "%d / %d\n",
                 atomic_read(&set->set_remaining), n);

        list_for_each_safe(tmp, next, &set->set_requests) {
                struct ptlrpc_request *req =
                        list_entry(tmp, struct ptlrpc_request,
                                   rq_set_chain);
                list_del_init(&req->rq_set_chain);

                LASSERT(req->rq_phase == expected_phase);

                if (req->rq_phase == RQ_PHASE_NEW) {
                        ptlrpc_req_interpret(NULL, req, -EBADR);
                        atomic_dec(&set->set_remaining);
                }

                spin_lock(&req->rq_lock);
                req->rq_set = NULL;
                req->rq_invalid_rqset = 0;
                spin_unlock(&req->rq_lock);

                ptlrpc_req_finished(req);
        }

        LASSERT(atomic_read(&set->set_remaining) == 0);

        ptlrpc_reqset_put(set);
}
EXPORT_SYMBOL(ptlrpc_set_destroy);

/**
 * Add a callback function \a fn to the set.
 * This function would be called when all requests on this set are completed.
 * The function will be passed the \a data argument.
 */
int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
                      set_interpreter_func fn, void *data)
{
        struct ptlrpc_set_cbdata *cbdata;

        OBD_ALLOC_PTR(cbdata);
        if (cbdata == NULL)
                RETURN(-ENOMEM);

        cbdata->psc_interpret = fn;
        cbdata->psc_data = data;
        list_add_tail(&cbdata->psc_item, &set->set_cblist);

        RETURN(0);
}
EXPORT_SYMBOL(ptlrpc_set_add_cb);

/**
 * Add a new request to the general purpose request set.
 * Assumes request reference from the caller.
 */
void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
                        struct ptlrpc_request *req)
{
        LASSERT(list_empty(&req->rq_set_chain));

        /* The set takes over the caller's request reference */
        list_add_tail(&req->rq_set_chain, &set->set_requests);
        req->rq_set = set;
        atomic_inc(&set->set_remaining);
        req->rq_queued_time = cfs_time_current();

        if (req->rq_reqmsg != NULL)
                lustre_msg_set_jobid(req->rq_reqmsg, NULL);

        if (set->set_producer != NULL)
                /* If the request set has a producer callback, the RPC must be
                 * sent straight away */
                ptlrpc_send_new_req(req);
}
EXPORT_SYMBOL(ptlrpc_set_add_req);

/**
 * Add a request to a request set with a dedicated server thread
 * and wake the thread to do any necessary processing.
 * Currently only used for ptlrpcd.
 */
void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
                            struct ptlrpc_request *req)
{
        struct ptlrpc_request_set *set = pc->pc_set;
        int count, i;

        LASSERT(req->rq_set == NULL);
        LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);

        spin_lock(&set->set_new_req_lock);
        /*
         * The set takes over the caller's request reference.
         */
        req->rq_set = set;
        req->rq_queued_time = cfs_time_current();
        list_add_tail(&req->rq_set_chain, &set->set_new_requests);
        count = atomic_inc_return(&set->set_new_count);
        spin_unlock(&set->set_new_req_lock);

        /* Only need to call wakeup once for the first entry. */
        if (count == 1) {
                wake_up(&set->set_waitq);

                /* XXX: It may be unnecessary to wake up all the partners. But
                 *      to guarantee the async RPC can be processed ASAP, we
                 *      have no other better choice. It may be fixed in the
                 *      future. */
                for (i = 0; i < pc->pc_npartners; i++)
                        wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
        }
}
EXPORT_SYMBOL(ptlrpc_set_add_new_req);

/**
 * Based on the current state of the import, determine if the request
 * can be sent, is an error, or should be delayed.
 *
 * Returns true if this request should be delayed. If false, and
 * *status is set, then the request cannot be sent and *status is the
 * error code. If false and *status is 0, then the request can be sent.
 *
 * The imp->imp_lock must be held.
 */
static int ptlrpc_import_delay_req(struct obd_import *imp,
                                   struct ptlrpc_request *req, int *status)
{
        int delay = 0;

        LASSERT(status != NULL);
        *status = 0;

        if (req->rq_ctx_init || req->rq_ctx_fini) {
                /* always allow ctx init/fini rpc go through */
        } else if (imp->imp_state == LUSTRE_IMP_NEW) {
                DEBUG_REQ(D_ERROR, req, "Uninitialized import.");
                *status = -EIO;
        } else if (imp->imp_state == LUSTRE_IMP_CLOSED) {
                /* pings may safely race with umount */
                DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ?
                          D_HA : D_ERROR, req, "IMP_CLOSED ");
                *status = -EIO;
        } else if (ptlrpc_send_limit_expired(req)) {
                /* probably doesn't need to be a D_ERROR after initial testing */
                DEBUG_REQ(D_ERROR, req, "send limit expired ");
                *status = -EIO;
        } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING &&
                   imp->imp_state == LUSTRE_IMP_CONNECTING) {
                /* allow CONNECT even if import is invalid */
                if (atomic_read(&imp->imp_inval_count) != 0) {
                        DEBUG_REQ(D_ERROR, req, "invalidate in flight");
                        *status = -EIO;
                }
        } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) {
                if (!imp->imp_deactive)
                        DEBUG_REQ(D_NET, req, "IMP_INVALID");
                *status = -ESHUTDOWN; /* bz 12940 */
        } else if (req->rq_import_generation != imp->imp_generation) {
                DEBUG_REQ(D_ERROR, req, "req wrong generation:");
                *status = -EIO;
        } else if (req->rq_send_state != imp->imp_state) {
                /* invalidate in progress - any requests should be dropped */
                if (atomic_read(&imp->imp_inval_count) != 0) {
                        DEBUG_REQ(D_ERROR, req, "invalidate in flight");
                        *status = -EIO;
                } else if (imp->imp_dlm_fake || req->rq_no_delay) {
                        *status = -EWOULDBLOCK;
                } else if (req->rq_allow_replay &&
                           (imp->imp_state == LUSTRE_IMP_REPLAY ||
                            imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS ||
                            imp->imp_state == LUSTRE_IMP_REPLAY_WAIT ||
                            imp->imp_state == LUSTRE_IMP_RECOVER)) {
                        DEBUG_REQ(D_HA, req, "allow during recovery.\n");
                } else {
                        delay = 1;
                }
        }

        RETURN(delay);
}

/**
 * Decide if the error message regarding provided request \a req
 * should be printed to the console or not.
 * Makes its decision based on request status and other properties.
 * Returns 1 to print error on the system console or 0 if not.
 */
static int ptlrpc_console_allow(struct ptlrpc_request *req)
{
        __u32 opc;
        int err;

        LASSERT(req->rq_reqmsg != NULL);
        opc = lustre_msg_get_opc(req->rq_reqmsg);

        /* Suppress particular reconnect errors which are to be expected. No
         * errors are suppressed for the initial connection on an import */
        if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
            (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {

                /* Suppress timed out reconnect requests */
                if (req->rq_timedout)
                        return 0;

                /* Suppress unavailable/again reconnect requests */
                err = lustre_msg_get_status(req->rq_repmsg);
                if (err == -ENODEV || err == -EAGAIN)
                        return 0;
        }

        return 1;
}

/**
 * Check request processing status.
 * Returns the status.
 */
static int ptlrpc_check_status(struct ptlrpc_request *req)
{
        int err;

        err = lustre_msg_get_status(req->rq_repmsg);
        if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
                struct obd_import *imp = req->rq_import;
                __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);

                if (ptlrpc_console_allow(req))
                        LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s,"
                                           " operation %s failed with %d.\n",
                                           imp->imp_obd->obd_name,
                                           libcfs_nid2str(
                                                   imp->imp_connection->c_peer.nid),
                                           ll_opcode2str(opc), err);
                RETURN(err < 0 ? err : -EINVAL);
        }

        if (err < 0) {
                DEBUG_REQ(D_INFO, req, "status is %d", err);
        } else if (err > 0) {
                /* XXX: translate this error from net to host */
                DEBUG_REQ(D_INFO, req, "status is %d", err);
        }

        RETURN(err);
}

/**
 * Save pre-versions of objects into request for replay.
 * Versions are obtained from the server reply.
 * Used for VBR.
 */
static void ptlrpc_save_versions(struct ptlrpc_request *req)
{
        struct lustre_msg *repmsg = req->rq_repmsg;
        struct lustre_msg *reqmsg = req->rq_reqmsg;
        __u64 *versions = lustre_msg_get_versions(repmsg);

        if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
                return;

        LASSERT(versions);
        lustre_msg_set_versions(reqmsg, versions);
        CDEBUG(D_INFO, "Client save versions ["LPX64"/"LPX64"]\n",
               versions[0], versions[1]);
}

/**
 * Callback function called when client receives RPC reply for \a req.
 * Returns 0 on success or error code.
 * The return value will be assigned to req->rq_status by the caller
 * as the request processing status.
 * This function also decides if the request needs to be saved for later replay.
 */
static int after_reply(struct ptlrpc_request *req)
{
        struct obd_import *imp = req->rq_import;
        struct obd_device *obd = req->rq_import->imp_obd;
        int rc;
        struct timeval work_start;
        long timediff;

        LASSERT(obd != NULL);
        /* repbuf must be unlinked */
        LASSERT(!req->rq_receiving_reply && !req->rq_must_unlink);

        if (req->rq_reply_truncate) {
                if (ptlrpc_no_resend(req)) {
                        DEBUG_REQ(D_ERROR, req, "reply buffer overflow,"
                                  " expected: %d, actual size: %d",
                                  req->rq_nob_received, req->rq_repbuf_len);
                        RETURN(-EOVERFLOW);
                }

                sptlrpc_cli_free_repbuf(req);
                /* Pass the required reply buffer size (include
                 * space for early reply).
                 * NB: no need to round up because alloc_repbuf
                 * will round it up */
                req->rq_replen = req->rq_nob_received;
                req->rq_nob_received = 0;
                req->rq_resend = 1;
                RETURN(0);
        }

        /*
         * NB Until this point, the whole of the incoming message,
         * including buflens, status etc is in the sender's byte order.
         */
        rc = sptlrpc_cli_unwrap_reply(req);
        if (rc) {
                DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc);
                RETURN(rc);
        }

        /*
         * Security layer unwrap might ask to resend this request.
         */
        if (req->rq_resend)
                RETURN(0);

        rc = unpack_reply(req);
        if (rc)
                RETURN(rc);

        /* retry indefinitely on EINPROGRESS */
        if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
            ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
                time_t now = cfs_time_current_sec();

                DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS");
                req->rq_resend = 1;
                req->rq_nr_resend++;

                /* allocate new xid to avoid reply reconstruction */
                if (!req->rq_bulk) {
                        /* new xid is already allocated for bulk in
                         * ptlrpc_check_set() */
                        req->rq_xid = ptlrpc_next_xid();
                        DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for "
                                  "resend on EINPROGRESS");
                }

                /* Readjust the timeout for current conditions */
                ptlrpc_at_set_req_timeout(req);
                /* delay resend to give a chance to the server to get ready.
                 * The delay is increased by 1s on every resend and is capped to
                 * the current request timeout (i.e. obd_timeout if AT is off,
                 * or AT service time x 125% + 5s, see at_est2timeout) */
                if (req->rq_nr_resend > req->rq_timeout)
                        req->rq_sent = now + req->rq_timeout;
                else
                        req->rq_sent = now + req->rq_nr_resend;

                RETURN(0);
        }

        do_gettimeofday(&work_start);
        timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL);
        if (obd->obd_svc_stats != NULL) {
                lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR,
                                    timediff);
                ptlrpc_lprocfs_rpc_sent(req, timediff);
        }

        if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
            lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
                DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)",
                          lustre_msg_get_type(req->rq_repmsg));
                RETURN(-EPROTO);
        }

        if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
                CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val);
        ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
        ptlrpc_at_adj_net_latency(req,
                                  lustre_msg_get_service_time(req->rq_repmsg));

        rc = ptlrpc_check_status(req);
        imp->imp_connect_error = rc;

        if (rc) {
                /*
                 * Either we've been evicted, or the server has failed for
                 * some reason. Try to reconnect, and if that fails, punt to
                 * the upcall.
                 */
                if (ll_rpc_recoverable_error(rc)) {
                        if (req->rq_send_state != LUSTRE_IMP_FULL ||
                            imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
                                RETURN(rc);
                        }
                        ptlrpc_request_handle_notconn(req);
                        RETURN(rc);
                }
        } else {
                /*
                 * Let's look if server sent slv. Do it only for RPC with
                 * rc == 0.
                 */
                ldlm_cli_update_pool(req);
        }

        /*
         * Store transno in reqmsg for replay.
         */
        if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
                req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
                lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
        }

        if (imp->imp_replayable) {
                spin_lock(&imp->imp_lock);
                /*
                 * No point in adding already-committed requests to the replay
                 * list, we will just remove them immediately. b=9829
                 */
                if (req->rq_transno != 0 &&
                    (req->rq_transno >
                     lustre_msg_get_last_committed(req->rq_repmsg) ||
                     req->rq_replay)) {
                        /** version recovery */
                        ptlrpc_save_versions(req);
                        ptlrpc_retain_replayable_request(req, imp);
                } else if (req->rq_commit_cb != NULL) {
                        spin_unlock(&imp->imp_lock);
                        req->rq_commit_cb(req);
                        spin_lock(&imp->imp_lock);
                }

                /*
                 * Replay-enabled imports return commit-status information.
                 */
                if (lustre_msg_get_last_committed(req->rq_repmsg)) {
                        imp->imp_peer_committed_transno =
                                lustre_msg_get_last_committed(req->rq_repmsg);
                }

                ptlrpc_free_committed(imp);

                if (!list_empty(&imp->imp_replay_list)) {
                        struct ptlrpc_request *last;

                        last = list_entry(imp->imp_replay_list.prev,
                                          struct ptlrpc_request,
                                          rq_replay_list);
                        /*
                         * Requests with rq_replay stay on the list even if no
                         * commit is expected.
                         */
                        if (last->rq_transno > imp->imp_peer_committed_transno)
                                ptlrpc_pinger_commit_expected(imp);
                }

                spin_unlock(&imp->imp_lock);
        }

        RETURN(rc);
}

/**
 * Helper function to send request \a req over the network for the first time.
 * Also adjusts request phase.
 * Returns 0 on success or error code.
 */
static int ptlrpc_send_new_req(struct ptlrpc_request *req)
{
        struct obd_import *imp = req->rq_import;
        int rc;

        LASSERT(req->rq_phase == RQ_PHASE_NEW);
        if (req->rq_sent && (req->rq_sent > cfs_time_current_sec()) &&
            (!req->rq_generation_set ||
             req->rq_import_generation == imp->imp_generation))
                RETURN(0);

        ptlrpc_rqphase_move(req, RQ_PHASE_RPC);

        spin_lock(&imp->imp_lock);

        if (!req->rq_generation_set)
                req->rq_import_generation = imp->imp_generation;

        if (ptlrpc_import_delay_req(imp, req, &rc)) {
                spin_lock(&req->rq_lock);
                req->rq_waiting = 1;
                spin_unlock(&req->rq_lock);

                DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: "
                          "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg),
                          ptlrpc_import_state_name(req->rq_send_state),
                          ptlrpc_import_state_name(imp->imp_state));
                LASSERT(list_empty(&req->rq_list));
                list_add_tail(&req->rq_list, &imp->imp_delayed_list);
                atomic_inc(&req->rq_import->imp_inflight);
                spin_unlock(&imp->imp_lock);
                RETURN(0);
        }

        if (rc != 0) {
                spin_unlock(&imp->imp_lock);
                req->rq_status = rc;
                ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
                RETURN(rc);
        }

        LASSERT(list_empty(&req->rq_list));
        list_add_tail(&req->rq_list, &imp->imp_sending_list);
        atomic_inc(&req->rq_import->imp_inflight);
        spin_unlock(&imp->imp_lock);

        lustre_msg_set_status(req->rq_reqmsg, current_pid());

        rc = sptlrpc_req_refresh_ctx(req, -1);
        if (rc) {
                if (req->rq_err) {
                        req->rq_status = rc;
                        RETURN(1);
                } else {
                        req->rq_wait_ctx = 1;
                        RETURN(0);
                }
        }

        CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc"
               " %s:%s:%d:"LPU64":%s:%d\n", current_comm(),
               imp->imp_obd->obd_uuid.uuid,
               lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
               libcfs_nid2str(imp->imp_connection->c_peer.nid),
               lustre_msg_get_opc(req->rq_reqmsg));

        rc = ptl_send_rpc(req, 0);
        if (rc) {
                DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
                req->rq_net_err = 1;
                RETURN(rc);
        }
        RETURN(0);
}

static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set)
{
        int remaining, rc;

        LASSERT(set->set_producer != NULL);

        remaining = atomic_read(&set->set_remaining);

        /* populate the ->set_requests list with requests until we
         * reach the maximum number of RPCs in flight for this set */
        while (atomic_read(&set->set_remaining) < set->set_max_inflight) {
                rc = set->set_producer(set, set->set_producer_arg);
                if (rc == -ENOENT) {
                        /* no more RPC to produce */
                        set->set_producer = NULL;
                        set->set_producer_arg = NULL;
                        RETURN(0);
                }
        }

        RETURN((atomic_read(&set->set_remaining) - remaining));
}
1454 | ||
1455 | /** | |
1456 | * this sends any unsent RPCs in \a set and returns 1 if all are sent | |
1457 | * and no more replies are expected. | |
1458 | * (it is possible to get less replies than requests sent e.g. due to timed out | |
1459 | * requests or requests that we had trouble to send out) | |
1460 | */ | |
1461 | int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) | |
1462 | { | |
1463 | struct list_head *tmp, *next; | |
1464 | int force_timer_recalc = 0; | |
d7e09d03 PT |
1465 | |
1466 | if (atomic_read(&set->set_remaining) == 0) | |
1467 | RETURN(1); | |
1468 | ||
1469 | list_for_each_safe(tmp, next, &set->set_requests) { | |
1470 | struct ptlrpc_request *req = | |
1471 | list_entry(tmp, struct ptlrpc_request, | |
1472 | rq_set_chain); | |
1473 | struct obd_import *imp = req->rq_import; | |
1474 | int unregistered = 0; | |
1475 | int rc = 0; | |
1476 | ||
1477 | if (req->rq_phase == RQ_PHASE_NEW && | |
1478 | ptlrpc_send_new_req(req)) { | |
1479 | force_timer_recalc = 1; | |
1480 | } | |
1481 | ||
1482 | /* delayed send - skip */ | |
1483 | if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) | |
1484 | continue; | |
1485 | ||
1486 | /* delayed resend - skip */ | |
1487 | if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && | |
1488 | req->rq_sent > cfs_time_current_sec()) | |
1489 | continue; | |
1490 | ||
1491 | if (!(req->rq_phase == RQ_PHASE_RPC || | |
1492 | req->rq_phase == RQ_PHASE_BULK || | |
1493 | req->rq_phase == RQ_PHASE_INTERPRET || | |
1494 | req->rq_phase == RQ_PHASE_UNREGISTERING || | |
1495 | req->rq_phase == RQ_PHASE_COMPLETE)) { | |
1496 | DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); | |
1497 | LBUG(); | |
1498 | } | |
1499 | ||
1500 | if (req->rq_phase == RQ_PHASE_UNREGISTERING) { | |
1501 | LASSERT(req->rq_next_phase != req->rq_phase); | |
1502 | LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); | |
1503 | ||
1504 | /* | |
1505 | * Skip processing until reply is unlinked. We | |
1506 | * can't return to pool before that and we can't | |
1507 | * call interpret before that. We need to make | |
1508 | * sure that all rdma transfers finished and will | |
1509 | * not corrupt any data. | |
1510 | */ | |
1511 | if (ptlrpc_client_recv_or_unlink(req) || | |
1512 | ptlrpc_client_bulk_active(req)) | |
1513 | continue; | |
1514 | ||
1515 | /* | |
1516 | * Turn fail_loc off to prevent it from looping | |
1517 | * forever. | |
1518 | */ | |
1519 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { | |
1520 | OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, | |
1521 | OBD_FAIL_ONCE); | |
1522 | } | |
1523 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { | |
1524 | OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, | |
1525 | OBD_FAIL_ONCE); | |
1526 | } | |
1527 | ||
1528 | /* | |
1529 | * Move to next phase if reply was successfully | |
1530 | * unlinked. | |
1531 | */ | |
1532 | ptlrpc_rqphase_move(req, req->rq_next_phase); | |
1533 | } | |
1534 | ||
1535 | if (req->rq_phase == RQ_PHASE_COMPLETE) | |
1536 | continue; | |
1537 | ||
1538 | if (req->rq_phase == RQ_PHASE_INTERPRET) | |
1539 | GOTO(interpret, req->rq_status); | |
1540 | ||
1541 | /* | |
1542 | * Note that this also will start async reply unlink. | |
1543 | */ | |
1544 | if (req->rq_net_err && !req->rq_timedout) { | |
1545 | ptlrpc_expire_one_request(req, 1); | |
1546 | ||
1547 | /* | |
1548 | * Check if we still need to wait for unlink. | |
1549 | */ | |
1550 | if (ptlrpc_client_recv_or_unlink(req) || | |
1551 | ptlrpc_client_bulk_active(req)) | |
1552 | continue; | |
1553 | /* If there is no need to resend, fail it now. */ | |
1554 | if (req->rq_no_resend) { | |
1555 | if (req->rq_status == 0) | |
1556 | req->rq_status = -EIO; | |
1557 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
1558 | GOTO(interpret, req->rq_status); | |
1559 | } else { | |
1560 | continue; | |
1561 | } | |
1562 | } | |
1563 | ||
1564 | if (req->rq_err) { | |
1565 | spin_lock(&req->rq_lock); | |
1566 | req->rq_replied = 0; | |
1567 | spin_unlock(&req->rq_lock); | |
1568 | if (req->rq_status == 0) | |
1569 | req->rq_status = -EIO; | |
1570 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
1571 | GOTO(interpret, req->rq_status); | |
1572 | } | |
1573 | ||
1574 | /* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr | |
1575 | * so it sets rq_intr regardless of individual rpc | |
1576 | * timeouts. The synchronous IO waiting path sets | |
1577 | * rq_intr irrespective of whether ptlrpcd | |
1578 | * has seen a timeout. Our policy is to only interpret | |
1579 | * interrupted rpcs after they have timed out, so we | |
1580 | * need to enforce that here. | |
1581 | */ | |
1582 | ||
1583 | if (req->rq_intr && (req->rq_timedout || req->rq_waiting || | |
1584 | req->rq_wait_ctx)) { | |
1585 | req->rq_status = -EINTR; | |
1586 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
1587 | GOTO(interpret, req->rq_status); | |
1588 | } | |
1589 | ||
1590 | if (req->rq_phase == RQ_PHASE_RPC) { | |
1591 | if (req->rq_timedout || req->rq_resend || | |
1592 | req->rq_waiting || req->rq_wait_ctx) { | |
1593 | int status; | |
1594 | ||
1595 | if (!ptlrpc_unregister_reply(req, 1)) | |
1596 | continue; | |
1597 | ||
1598 | spin_lock(&imp->imp_lock); | |
1599 | if (ptlrpc_import_delay_req(imp, req, &status)) { | |
1600 | /* put on delay list - only if we wait for | |
1601 | * recovery to finish - before send */ | |
1602 | list_del_init(&req->rq_list); | |
1603 | list_add_tail(&req->rq_list, | |
1604 | &imp->imp_delayed_list); | |
1606 | spin_unlock(&imp->imp_lock); | |
1607 | continue; | |
1608 | } | |
1609 | ||
1610 | if (status != 0) { | |
1611 | req->rq_status = status; | |
1612 | ptlrpc_rqphase_move(req, | |
1613 | RQ_PHASE_INTERPRET); | |
1614 | spin_unlock(&imp->imp_lock); | |
1615 | GOTO(interpret, req->rq_status); | |
1616 | } | |
1617 | if (ptlrpc_no_resend(req) && | |
1618 | !req->rq_wait_ctx) { | |
1619 | req->rq_status = -ENOTCONN; | |
1620 | ptlrpc_rqphase_move(req, | |
1621 | RQ_PHASE_INTERPRET); | |
1622 | spin_unlock(&imp->imp_lock); | |
1623 | GOTO(interpret, req->rq_status); | |
1624 | } | |
1625 | ||
1626 | list_del_init(&req->rq_list); | |
1627 | list_add_tail(&req->rq_list, | |
1628 | &imp->imp_sending_list); | |
1629 | ||
1630 | spin_unlock(&imp->imp_lock); | |
1631 | ||
1632 | spin_lock(&req->rq_lock); | |
1633 | req->rq_waiting = 0; | |
1634 | spin_unlock(&req->rq_lock); | |
1635 | ||
1636 | if (req->rq_timedout || req->rq_resend) { | |
1637 | /* This is re-sending anyway, | |
1638 | * let's mark req as resend. */ | |
1639 | spin_lock(&req->rq_lock); | |
1640 | req->rq_resend = 1; | |
1641 | spin_unlock(&req->rq_lock); | |
1642 | if (req->rq_bulk) { | |
1643 | __u64 old_xid; | |
1644 | ||
1645 | if (!ptlrpc_unregister_bulk(req, 1)) | |
1646 | continue; | |
1647 | ||
1648 | /* ensure previous bulk fails */ | |
1649 | old_xid = req->rq_xid; | |
1650 | req->rq_xid = ptlrpc_next_xid(); | |
1651 | CDEBUG(D_HA, "resend bulk " | |
1652 | "old x"LPU64 | |
1653 | " new x"LPU64"\n", | |
1654 | old_xid, req->rq_xid); | |
1655 | } | |
1656 | } | |
1657 | /* | |
1658 | * rq_wait_ctx is only touched by ptlrpcd, | |
1659 | * so no lock is needed here. | |
1660 | */ | |
1661 | status = sptlrpc_req_refresh_ctx(req, -1); | |
1662 | if (status) { | |
1663 | if (req->rq_err) { | |
1664 | req->rq_status = status; | |
1665 | spin_lock(&req->rq_lock); | |
1666 | req->rq_wait_ctx = 0; | |
1667 | spin_unlock(&req->rq_lock); | |
1668 | force_timer_recalc = 1; | |
1669 | } else { | |
1670 | spin_lock(&req->rq_lock); | |
1671 | req->rq_wait_ctx = 1; | |
1672 | spin_unlock(&req->rq_lock); | |
1673 | } | |
1674 | ||
1675 | continue; | |
1676 | } else { | |
1677 | spin_lock(&req->rq_lock); | |
1678 | req->rq_wait_ctx = 0; | |
1679 | spin_unlock(&req->rq_lock); | |
1680 | } | |
1681 | ||
1682 | rc = ptl_send_rpc(req, 0); | |
1683 | if (rc) { | |
1684 | DEBUG_REQ(D_HA, req, | |
1685 | "send failed: rc = %d", rc); | |
1686 | force_timer_recalc = 1; | |
1687 | spin_lock(&req->rq_lock); | |
1688 | req->rq_net_err = 1; | |
1689 | spin_unlock(&req->rq_lock); | |
1690 | } | |
1691 | /* need to reset the timeout */ | |
1692 | force_timer_recalc = 1; | |
1693 | } | |
1694 | ||
1695 | spin_lock(&req->rq_lock); | |
1696 | ||
1697 | if (ptlrpc_client_early(req)) { | |
1698 | ptlrpc_at_recv_early_reply(req); | |
1699 | spin_unlock(&req->rq_lock); | |
1700 | continue; | |
1701 | } | |
1702 | ||
1703 | /* Still waiting for a reply? */ | |
1704 | if (ptlrpc_client_recv(req)) { | |
1705 | spin_unlock(&req->rq_lock); | |
1706 | continue; | |
1707 | } | |
1708 | ||
1709 | /* Did we actually receive a reply? */ | |
1710 | if (!ptlrpc_client_replied(req)) { | |
1711 | spin_unlock(&req->rq_lock); | |
1712 | continue; | |
1713 | } | |
1714 | ||
1715 | spin_unlock(&req->rq_lock); | |
1716 | ||
1717 | /* unlink from net because we are going to | |
1718 | * swab the reply buffer in place */ | |
1719 | unregistered = ptlrpc_unregister_reply(req, 1); | |
1720 | if (!unregistered) | |
1721 | continue; | |
1722 | ||
1723 | req->rq_status = after_reply(req); | |
1724 | if (req->rq_resend) | |
1725 | continue; | |
1726 | ||
1727 | /* If there is no bulk associated with this request, | |
1728 | * then we're done and should let the interpreter | |
1729 | * process the reply. Similarly if the RPC returned | |
1730 | * an error, and therefore the bulk will never arrive. | |
1731 | */ | |
1732 | if (req->rq_bulk == NULL || req->rq_status < 0) { | |
1733 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
1734 | GOTO(interpret, req->rq_status); | |
1735 | } | |
1736 | ||
1737 | ptlrpc_rqphase_move(req, RQ_PHASE_BULK); | |
1738 | } | |
1739 | ||
1740 | LASSERT(req->rq_phase == RQ_PHASE_BULK); | |
1741 | if (ptlrpc_client_bulk_active(req)) | |
1742 | continue; | |
1743 | ||
1744 | if (req->rq_bulk->bd_failure) { | |
1745 | /* The RPC reply arrived OK, but the bulk screwed | |
1746 | * up! Dead weird since the server told us the RPC | |
1747 | * was good after getting the REPLY for her GET or | |
1748 | * the ACK for her PUT. */ | |
1749 | DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); | |
1750 | req->rq_status = -EIO; | |
1751 | } | |
1752 | ||
1753 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
1754 | ||
1755 | interpret: | |
1756 | LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); | |
1757 | ||
1758 | /* This moves to "unregistering" phase - we need to wait | |
1759 | * for the reply to be unlinked. */ | |
1760 | if (!unregistered && !ptlrpc_unregister_reply(req, 1)) { | |
1761 | /* start async bulk unlink too */ | |
1762 | ptlrpc_unregister_bulk(req, 1); | |
1763 | continue; | |
1764 | } | |
1765 | ||
1766 | if (!ptlrpc_unregister_bulk(req, 1)) | |
1767 | continue; | |
1768 | ||
1769 | /* By the time we call interpret, receiving must | |
1770 | * already be finished. */ | |
1771 | LASSERT(!req->rq_receiving_reply); | |
1772 | ||
1773 | ptlrpc_req_interpret(env, req, req->rq_status); | |
1774 | ||
1775 | ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); | |
1776 | ||
1777 | CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0, | |
1778 | "Completed RPC pname:cluuid:pid:xid:nid:" | |
1779 | "opc %s:%s:%d:"LPU64":%s:%d\n", | |
1780 | current_comm(), imp->imp_obd->obd_uuid.uuid, | |
1781 | lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, | |
1782 | libcfs_nid2str(imp->imp_connection->c_peer.nid), | |
1783 | lustre_msg_get_opc(req->rq_reqmsg)); | |
1784 | ||
1785 | spin_lock(&imp->imp_lock); | |
1786 | /* The request may no longer be on the sending or delayed | |
1787 | * list. This happens when it is marked erroneous because | |
1788 | * ptlrpc_import_delay_req(req, status) found it impossible | |
1789 | * to allow sending this RPC and returned *status != 0. */ | |
1790 | if (!list_empty(&req->rq_list)) { | |
1791 | list_del_init(&req->rq_list); | |
1792 | atomic_dec(&imp->imp_inflight); | |
1793 | } | |
1794 | spin_unlock(&imp->imp_lock); | |
1795 | ||
1796 | atomic_dec(&set->set_remaining); | |
1797 | wake_up_all(&imp->imp_recovery_waitq); | |
1798 | ||
1799 | if (set->set_producer) { | |
1800 | /* produce a new request if possible */ | |
1801 | if (ptlrpc_set_producer(set) > 0) | |
1802 | force_timer_recalc = 1; | |
1803 | ||
1804 | /* free the request that has just been completed | |
1805 | * in order not to pollute set->set_requests */ | |
1806 | list_del_init(&req->rq_set_chain); | |
1807 | spin_lock(&req->rq_lock); | |
1808 | req->rq_set = NULL; | |
1809 | req->rq_invalid_rqset = 0; | |
1810 | spin_unlock(&req->rq_lock); | |
1811 | ||
1812 | /* record rq_status to compute the final status later */ | |
1813 | if (req->rq_status != 0) | |
1814 | set->set_rc = req->rq_status; | |
1815 | ptlrpc_req_finished(req); | |
1816 | } | |
1817 | } | |
1818 | ||
1819 | /* If we hit an error, we want to recover promptly. */ | |
1820 | RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc); | |
1821 | } | |
1822 | EXPORT_SYMBOL(ptlrpc_check_set); | |
1823 | ||
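/*
 * Editor's sketch (illustrative, not part of the original file): the
 * shape of a caller that drives ptlrpc_check_set() by hand instead of
 * going through ptlrpc_set_wait() below.  It sleeps on set_waitq until
 * check_set() reports that no request in the set still expects a reply.
 */
static void my_poll_set(const struct lu_env *env,
                        struct ptlrpc_request_set *set)
{
        struct l_wait_info lwi;
        int rc;

        while (atomic_read(&set->set_remaining) != 0) {
                /* re-check about once a second; ptlrpc_expired_set()
                 * times out any stragglers for us */
                lwi = LWI_TIMEOUT(cfs_time_seconds(1),
                                  ptlrpc_expired_set, set);
                rc = l_wait_event(set->set_waitq,
                                  ptlrpc_check_set(env, set), &lwi);
                (void)rc;       /* -ETIMEDOUT here just means "poll again" */
        }
}
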
1824 | /** | |
1825 | * Time out request \a req. If \a async_unlink is set, do not wait | |
1826 | * until LNet actually confirms network buffer unlinking. | |
1827 | * Return 1 if we should give up further retrying attempts or 0 otherwise. | |
1828 | */ | |
1829 | int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) | |
1830 | { | |
1831 | struct obd_import *imp = req->rq_import; | |
1832 | int rc = 0; | |
1833 | |
1834 | spin_lock(&req->rq_lock); | |
1835 | req->rq_timedout = 1; | |
1836 | spin_unlock(&req->rq_lock); | |
1837 | ||
1838 | DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T | |
1839 | "/real "CFS_DURATION_T"]", | |
1840 | req->rq_net_err ? "failed due to network error" : | |
1841 | ((req->rq_real_sent == 0 || | |
1842 | cfs_time_before(req->rq_real_sent, req->rq_sent) || | |
1843 | cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ? | |
1844 | "timed out for sent delay" : "timed out for slow reply"), | |
1845 | req->rq_sent, req->rq_real_sent); | |
1846 | ||
1847 | if (imp != NULL && obd_debug_peer_on_timeout) | |
1848 | LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer); | |
1849 | ||
1850 | ptlrpc_unregister_reply(req, async_unlink); | |
1851 | ptlrpc_unregister_bulk(req, async_unlink); | |
1852 | ||
1853 | if (obd_dump_on_timeout) | |
1854 | libcfs_debug_dumplog(); | |
1855 | ||
1856 | if (imp == NULL) { | |
1857 | DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); | |
1858 | RETURN(1); | |
1859 | } | |
1860 | ||
1861 | atomic_inc(&imp->imp_timeouts); | |
1862 | ||
1863 | /* The DLM server doesn't want recovery run on its imports. */ | |
1864 | if (imp->imp_dlm_fake) | |
1865 | RETURN(1); | |
1866 | ||
1867 | /* If this request is for recovery or other primordial tasks, | |
1868 | * then error it out here. */ | |
1869 | if (req->rq_ctx_init || req->rq_ctx_fini || | |
1870 | req->rq_send_state != LUSTRE_IMP_FULL || | |
1871 | imp->imp_obd->obd_no_recov) { | |
1872 | DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", | |
1873 | ptlrpc_import_state_name(req->rq_send_state), | |
1874 | ptlrpc_import_state_name(imp->imp_state)); | |
1875 | spin_lock(&req->rq_lock); | |
1876 | req->rq_status = -ETIMEDOUT; | |
1877 | req->rq_err = 1; | |
1878 | spin_unlock(&req->rq_lock); | |
1879 | RETURN(1); | |
1880 | } | |
1881 | ||
1882 | /* if a request can't be resent we can't wait for an answer after | |
1883 | the timeout */ | |
1884 | if (ptlrpc_no_resend(req)) { | |
1885 | DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); | |
1886 | rc = 1; | |
1887 | } | |
1888 | ||
1889 | ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); | |
1890 | ||
1891 | RETURN(rc); | |
1892 | } | |
1893 | ||
1894 | /** | |
1895 | * Time out all uncompleted requests in request set pointed by \a data | |
1896 | * Callback used when waiting on sets with l_wait_event. | |
1897 | * Always returns 1. | |
1898 | */ | |
1899 | int ptlrpc_expired_set(void *data) | |
1900 | { | |
1901 | struct ptlrpc_request_set *set = data; | |
1902 | struct list_head *tmp; | |
1903 | time_t now = cfs_time_current_sec(); | |
1904 | |
1905 | LASSERT(set != NULL); | |
1906 | ||
1907 | /* | |
1908 | * A timeout expired. See which reqs it applies to... | |
1909 | */ | |
1910 | list_for_each(tmp, &set->set_requests) { | |
1911 | struct ptlrpc_request *req = | |
1912 | list_entry(tmp, struct ptlrpc_request, | |
1913 | rq_set_chain); | |
1914 | ||
1915 | /* don't expire request waiting for context */ | |
1916 | if (req->rq_wait_ctx) | |
1917 | continue; | |
1918 | ||
1919 | /* Request in-flight? */ | |
1920 | if (!((req->rq_phase == RQ_PHASE_RPC && | |
1921 | !req->rq_waiting && !req->rq_resend) || | |
1922 | (req->rq_phase == RQ_PHASE_BULK))) | |
1923 | continue; | |
1924 | ||
1925 | if (req->rq_timedout || /* already dealt with */ | |
1926 | req->rq_deadline > now) /* not expired */ | |
1927 | continue; | |
1928 | ||
1929 | /* Deal with this request. Do it asynchronously so as not | |
1930 | * to block the ptlrpcd thread. */ | |
1931 | ptlrpc_expire_one_request(req, 1); | |
1932 | } | |
1933 | ||
1934 | /* | |
1935 | * When waiting for a whole set, we always break out of the | |
1936 | * sleep so we can recalculate the timeout, or enable interrupts | |
1937 | * if everyone's timed out. | |
1938 | */ | |
1939 | RETURN(1); | |
1940 | } | |
1941 | EXPORT_SYMBOL(ptlrpc_expired_set); | |
1942 | ||
1943 | /** | |
1944 | * Sets rq_intr flag in \a req under spinlock. | |
1945 | */ | |
1946 | void ptlrpc_mark_interrupted(struct ptlrpc_request *req) | |
1947 | { | |
1948 | spin_lock(&req->rq_lock); | |
1949 | req->rq_intr = 1; | |
1950 | spin_unlock(&req->rq_lock); | |
1951 | } | |
1952 | EXPORT_SYMBOL(ptlrpc_mark_interrupted); | |
1953 | ||
1954 | /** | |
1955 | * Interrupts (sets interrupted flag) all uncompleted requests in | |
1956 | * a set \a data. Callback for l_wait_event for interruptible waits. | |
1957 | */ | |
1958 | void ptlrpc_interrupted_set(void *data) | |
1959 | { | |
1960 | struct ptlrpc_request_set *set = data; | |
1961 | struct list_head *tmp; | |
1962 | ||
1963 | LASSERT(set != NULL); | |
1964 | CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set); | |
1965 | ||
1966 | list_for_each(tmp, &set->set_requests) { | |
1967 | struct ptlrpc_request *req = | |
1968 | list_entry(tmp, struct ptlrpc_request, | |
1969 | rq_set_chain); | |
1970 | ||
1971 | if (req->rq_phase != RQ_PHASE_RPC && | |
1972 | req->rq_phase != RQ_PHASE_UNREGISTERING) | |
1973 | continue; | |
1974 | ||
1975 | ptlrpc_mark_interrupted(req); | |
1976 | } | |
1977 | } | |
1978 | EXPORT_SYMBOL(ptlrpc_interrupted_set); | |
1979 | ||
1980 | /** | |
1981 | * Get the smallest timeout in the set; this does NOT set a timeout. | |
1982 | */ | |
1983 | int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) | |
1984 | { | |
1985 | struct list_head *tmp; | |
1986 | time_t now = cfs_time_current_sec(); | |
1987 | int timeout = 0; | |
1988 | struct ptlrpc_request *req; | |
1989 | int deadline; | |
1990 | |
1991 | SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */ | |
1992 | ||
1993 | list_for_each(tmp, &set->set_requests) { | |
1994 | req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); | |
1995 | ||
1996 | /* | |
1997 | * Request in-flight? | |
1998 | */ | |
1999 | if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || | |
2000 | (req->rq_phase == RQ_PHASE_BULK) || | |
2001 | (req->rq_phase == RQ_PHASE_NEW))) | |
2002 | continue; | |
2003 | ||
2004 | /* | |
2005 | * Already timed out. | |
2006 | */ | |
2007 | if (req->rq_timedout) | |
2008 | continue; | |
2009 | ||
2010 | /* | |
2011 | * Waiting for ctx. | |
2012 | */ | |
2013 | if (req->rq_wait_ctx) | |
2014 | continue; | |
2015 | ||
2016 | if (req->rq_phase == RQ_PHASE_NEW) | |
2017 | deadline = req->rq_sent; | |
2018 | else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend) | |
2019 | deadline = req->rq_sent; | |
2020 | else | |
2021 | deadline = req->rq_sent + req->rq_timeout; | |
2022 | ||
2023 | if (deadline <= now) /* actually expired already */ | |
2024 | timeout = 1; /* ASAP */ | |
2025 | else if (timeout == 0 || timeout > deadline - now) | |
2026 | timeout = deadline - now; | |
2027 | } | |
2028 | RETURN(timeout); | |
2029 | } | |
2030 | EXPORT_SYMBOL(ptlrpc_set_next_timeout); | |
2031 | ||
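/*
 * Editor's note (illustrative refactoring, not in the original): the
 * per-request deadline rule applied above, in isolation.  Requests that
 * are new or queued for resend count their deadline from rq_sent alone
 * (rq_sent then holds the earliest allowed send time); requests already
 * on the wire add their rq_timeout.
 */
static inline time_t my_req_deadline(struct ptlrpc_request *req)
{
        if (req->rq_phase == RQ_PHASE_NEW ||
            (req->rq_phase == RQ_PHASE_RPC && req->rq_resend))
                return req->rq_sent;                    /* delayed (re)send */
        return req->rq_sent + req->rq_timeout;          /* reply deadline */
}
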
2032 | /** | |
2033 | * Send all unsent requests from the set and then wait until all | |
2034 | * requests in the set complete (either get a reply, time out, get an | |
2035 | * error, or otherwise be interrupted). | |
2036 | * Returns 0 on success or error code otherwise. | |
2037 | */ | |
2038 | int ptlrpc_set_wait(struct ptlrpc_request_set *set) | |
2039 | { | |
2040 | struct list_head *tmp; | |
2041 | struct ptlrpc_request *req; | |
2042 | struct l_wait_info lwi; | |
2043 | int rc, timeout; | |
2044 | |
2045 | if (set->set_producer) | |
2046 | (void)ptlrpc_set_producer(set); | |
2047 | else | |
2048 | list_for_each(tmp, &set->set_requests) { | |
2049 | req = list_entry(tmp, struct ptlrpc_request, | |
2050 | rq_set_chain); | |
2051 | if (req->rq_phase == RQ_PHASE_NEW) | |
2052 | (void)ptlrpc_send_new_req(req); | |
2053 | } | |
2054 | ||
2055 | if (list_empty(&set->set_requests)) | |
2056 | RETURN(0); | |
2057 | ||
2058 | do { | |
2059 | timeout = ptlrpc_set_next_timeout(set); | |
2060 | ||
2061 | /* wait until all complete, get interrupted, or an in-flight | |
2062 | * req times out */ | |
2063 | CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n", | |
2064 | set, timeout); | |
2065 | ||
2066 | if (timeout == 0 && !cfs_signal_pending()) | |
2067 | /* | |
2068 | * No requests are in-flight (either timed out | |
2069 | * or delayed), so we can allow interrupts. | |
2070 | * We still want to block for a limited time, | |
2071 | * so we allow interrupts during the timeout. | |
2072 | */ | |
2073 | lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1), | |
2074 | ptlrpc_expired_set, | |
2075 | ptlrpc_interrupted_set, set); | |
2076 | else | |
2077 | /* | |
2078 | * At least one request is in flight, so no | |
2079 | * interrupts are allowed. Wait until all | |
2080 | * complete, or an in-flight req times out. | |
2081 | */ | |
2082 | lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1), | |
2083 | ptlrpc_expired_set, set); | |
2084 | ||
2085 | rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi); | |
2086 | ||
2087 | /* LU-769 - if we ignored the signal because it was already | |
2088 | * pending when we started, we need to handle it now or we risk | |
2089 | * it being ignored forever */ | |
2090 | if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr && | |
2091 | cfs_signal_pending()) { | |
2092 | sigset_t blocked_sigs = | |
2093 | cfs_block_sigsinv(LUSTRE_FATAL_SIGS); | |
2094 | ||
2095 | /* In fact we only interrupt for the "fatal" signals | |
2096 | * like SIGINT or SIGKILL. We still ignore less | |
2097 | * important signals since ptlrpc set is not easily | |
2098 | * reentrant from userspace again */ | |
2099 | if (cfs_signal_pending()) | |
2100 | ptlrpc_interrupted_set(set); | |
2101 | cfs_restore_sigs(blocked_sigs); | |
2102 | } | |
2103 | ||
2104 | LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT); | |
2105 | ||
2106 | /* -EINTR => all requests have been flagged rq_intr so next | |
2107 | * check completes. | |
2108 | * -ETIMEDOUT => someone timed out. When all reqs have | |
2109 | * timed out, signals are enabled allowing completion with | |
2110 | * EINTR. | |
2111 | * I don't really care if we go once more round the loop in | |
2112 | * the error cases -eeb. */ | |
2113 | if (rc == 0 && atomic_read(&set->set_remaining) == 0) { | |
2114 | list_for_each(tmp, &set->set_requests) { | |
2115 | req = list_entry(tmp, struct ptlrpc_request, | |
2116 | rq_set_chain); | |
2117 | spin_lock(&req->rq_lock); | |
2118 | req->rq_invalid_rqset = 1; | |
2119 | spin_unlock(&req->rq_lock); | |
2120 | } | |
2121 | } | |
2122 | } while (rc != 0 || atomic_read(&set->set_remaining) != 0); | |
2123 | ||
2124 | LASSERT(atomic_read(&set->set_remaining) == 0); | |
2125 | ||
2126 | rc = set->set_rc; /* rq_status of already freed requests if any */ | |
2127 | list_for_each(tmp, &set->set_requests) { | |
2128 | req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); | |
2129 | ||
2130 | LASSERT(req->rq_phase == RQ_PHASE_COMPLETE); | |
2131 | if (req->rq_status != 0) | |
2132 | rc = req->rq_status; | |
2133 | } | |
2134 | ||
2135 | if (set->set_interpret != NULL) { | |
2136 | int (*interpreter)(struct ptlrpc_request_set *set, void *, int) = | |
2137 | set->set_interpret; | |
2138 | rc = interpreter(set, set->set_arg, rc); | |
2139 | } else { | |
2140 | struct ptlrpc_set_cbdata *cbdata, *n; | |
2141 | int err; | |
2142 | ||
2143 | list_for_each_entry_safe(cbdata, n, | |
2144 | &set->set_cblist, psc_item) { | |
2145 | list_del_init(&cbdata->psc_item); | |
2146 | err = cbdata->psc_interpret(set, cbdata->psc_data, rc); | |
2147 | if (err && !rc) | |
2148 | rc = err; | |
2149 | OBD_FREE_PTR(cbdata); | |
2150 | } | |
2151 | } | |
2152 | ||
2153 | RETURN(rc); | |
2154 | } | |
2155 | EXPORT_SYMBOL(ptlrpc_set_wait); | |
2156 | ||
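/*
 * Editor's sketch (assumed usage, not from the original file): the
 * minimal life cycle of a request set around ptlrpc_set_wait() -
 * allocate, queue pre-built requests, wait, destroy.  Note that
 * ptlrpc_set_destroy() drops the set's reference on each request, so a
 * caller that keeps using them afterwards would take its own reference
 * first (as ptlrpc_queue_wait() below does).
 */
static int my_send_batch(struct ptlrpc_request **reqs, int count)
{
        struct ptlrpc_request_set *set;
        int i, rc;

        set = ptlrpc_prep_set();
        if (set == NULL)
                return -ENOMEM;

        for (i = 0; i < count; i++)
                ptlrpc_set_add_req(set, reqs[i]);

        rc = ptlrpc_set_wait(set);      /* 0, or the first rq_status error */
        ptlrpc_set_destroy(set);
        return rc;
}
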
2157 | /** | |
2158 | * Helper function for request freeing. | |
2159 | * Called when the request refcount reaches zero and the request needs | |
2160 | * to be freed. Removes the request from any sending/replay lists it | |
2161 | * might be on and frees network buffers if any are present. | |
2162 | * If \a locked is set, the caller already holds the import imp_lock | |
2163 | * and we need not reacquire it (for certain list manipulations). | |
2164 | */ | |
2165 | static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) | |
2166 | { | |
2167 | if (request == NULL) { | |
2168 | return; | |
2169 | } | |
2170 | ||
2171 | LASSERTF(!request->rq_receiving_reply, "req %p\n", request); | |
2172 | LASSERTF(request->rq_rqbd == NULL, "req %p\n", request); /* client-side */ | |
2173 | LASSERTF(list_empty(&request->rq_list), "req %p\n", request); | |
2174 | LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); | |
2175 | LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request); | |
2176 | LASSERTF(!request->rq_replay, "req %p\n", request); | |
2177 | ||
2178 | req_capsule_fini(&request->rq_pill); | |
2179 | ||
2180 | /* We must take it off the imp_replay_list first. Otherwise, we'll set | |
2181 | * request->rq_reqmsg to NULL while osc_close is dereferencing it. */ | |
2182 | if (request->rq_import != NULL) { | |
2183 | if (!locked) | |
2184 | spin_lock(&request->rq_import->imp_lock); | |
2185 | list_del_init(&request->rq_replay_list); | |
2186 | if (!locked) | |
2187 | spin_unlock(&request->rq_import->imp_lock); | |
2188 | } | |
2189 | LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request); | |
2190 | ||
2191 | if (atomic_read(&request->rq_refcount) != 0) { | |
2192 | DEBUG_REQ(D_ERROR, request, | |
2193 | "freeing request with nonzero refcount"); | |
2194 | LBUG(); | |
2195 | } | |
2196 | ||
2197 | if (request->rq_repbuf != NULL) | |
2198 | sptlrpc_cli_free_repbuf(request); | |
2199 | if (request->rq_export != NULL) { | |
2200 | class_export_put(request->rq_export); | |
2201 | request->rq_export = NULL; | |
2202 | } | |
2203 | if (request->rq_import != NULL) { | |
2204 | class_import_put(request->rq_import); | |
2205 | request->rq_import = NULL; | |
2206 | } | |
2207 | if (request->rq_bulk != NULL) | |
2208 | ptlrpc_free_bulk_pin(request->rq_bulk); | |
2209 | ||
2210 | if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL) | |
2211 | sptlrpc_cli_free_reqbuf(request); | |
2212 | ||
2213 | if (request->rq_cli_ctx) | |
2214 | sptlrpc_req_put_ctx(request, !locked); | |
2215 | ||
2216 | if (request->rq_pool) | |
2217 | __ptlrpc_free_req_to_pool(request); | |
2218 | else | |
2219 | OBD_FREE(request, sizeof(*request)); | |
2220 | } |
2221 | ||
2222 | static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked); | |
2223 | /** | |
2224 | * Drop one request reference. Must be called with import imp_lock held. | |
2225 | * When the reference count drops to zero, the request is freed. | |
2226 | */ | |
2227 | void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request) | |
2228 | { | |
2229 | LASSERT(spin_is_locked(&request->rq_import->imp_lock)); | |
2230 | (void)__ptlrpc_req_finished(request, 1); | |
2231 | } | |
2232 | EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock); | |
2233 | ||
2234 | /** | |
2235 | * Helper function. | |
2236 | * Drops one reference for request \a request. | |
2237 | * \a locked set indicates that the caller holds the import imp_lock. | |
2238 | * Frees the request when the reference count reaches zero. | |
2239 | */ | |
2240 | static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) | |
2241 | { | |
2242 | if (request == NULL) |
2243 | RETURN(1); | |
2244 | ||
2245 | if (request == LP_POISON || | |
2246 | request->rq_reqmsg == LP_POISON) { | |
2247 | CERROR("dereferencing freed request (bug 575)\n"); | |
2248 | LBUG(); | |
2249 | RETURN(1); | |
2250 | } | |
2251 | ||
2252 | DEBUG_REQ(D_INFO, request, "refcount now %u", | |
2253 | atomic_read(&request->rq_refcount) - 1); | |
2254 | ||
2255 | if (atomic_dec_and_test(&request->rq_refcount)) { | |
2256 | __ptlrpc_free_req(request, locked); | |
2257 | RETURN(1); | |
2258 | } | |
2259 | ||
2260 | RETURN(0); | |
2261 | } | |
2262 | ||
2263 | /** | |
2264 | * Drops one reference count for a request. | |
2265 | */ | |
2266 | void ptlrpc_req_finished(struct ptlrpc_request *request) | |
2267 | { | |
2268 | __ptlrpc_req_finished(request, 0); | |
2269 | } | |
2270 | EXPORT_SYMBOL(ptlrpc_req_finished); | |
2271 | ||
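/*
 * Editor's note (illustrative sketch): the reference discipline implied
 * by the helpers above.  Every ptlrpc_request_addref() (defined further
 * down in this file) must be balanced by a ptlrpc_req_finished(); the
 * final put frees the request through __ptlrpc_free_req().
 */
static void my_inspect_req(struct ptlrpc_request *req)
{
        ptlrpc_request_addref(req);             /* pin while we look */
        CDEBUG(D_INFO, "req %p xid "LPU64"\n", req, req->rq_xid);
        ptlrpc_req_finished(req);               /* drop our pin */
}
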
2272 | /** | |
2273 | * Returns xid of a \a request | |
2274 | */ | |
2275 | __u64 ptlrpc_req_xid(struct ptlrpc_request *request) | |
2276 | { | |
2277 | return request->rq_xid; | |
2278 | } | |
2279 | EXPORT_SYMBOL(ptlrpc_req_xid); | |
2280 | ||
2281 | /** | |
2282 | * Disengage the client's reply buffer from the network | |
2283 | * NB does _NOT_ unregister any client-side bulk. | |
2284 | * IDEMPOTENT, but _not_ safe against concurrent callers. | |
2285 | * The request owner (i.e. the thread doing the I/O) must call... | |
2286 | * Returns 0 on success or 1 if unregistering cannot be made. | |
2287 | */ | |
2288 | int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) | |
2289 | { | |
2290 | int rc; | |
2291 | wait_queue_head_t *wq; | |
2292 | struct l_wait_info lwi; | |
2293 | ||
2294 | /* | |
2295 | * Might sleep. | |
2296 | */ | |
2297 | LASSERT(!in_interrupt()); | |
2298 | ||
2299 | /* | |
2300 | * Let's setup deadline for reply unlink. | |
2301 | */ | |
2302 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && | |
2303 | async && request->rq_reply_deadline == 0) | |
2304 | request->rq_reply_deadline = cfs_time_current_sec()+LONG_UNLINK; | |
2305 | ||
2306 | /* | |
2307 | * Nothing left to do. | |
2308 | */ | |
2309 | if (!ptlrpc_client_recv_or_unlink(request)) | |
2310 | RETURN(1); | |
2311 | ||
2312 | LNetMDUnlink(request->rq_reply_md_h); | |
2313 | ||
2314 | /* | |
2315 | * Let's check it once again. | |
2316 | */ | |
2317 | if (!ptlrpc_client_recv_or_unlink(request)) | |
2318 | RETURN(1); | |
2319 | ||
2320 | /* | |
2321 | * Move to "Unregistering" phase as reply was not unlinked yet. | |
2322 | */ | |
2323 | ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING); | |
2324 | ||
2325 | /* | |
2326 | * Do not wait for unlink to finish. | |
2327 | */ | |
2328 | if (async) | |
2329 | RETURN(0); | |
2330 | ||
2331 | /* | |
2332 | * We have to l_wait_event() whatever the result, to give liblustre | |
2333 | * a chance to run reply_in_callback(), and to make sure we've | |
2334 | * unlinked before returning a req to the pool. | |
2335 | */ | |
2336 | if (request->rq_set != NULL) | |
2337 | wq = &request->rq_set->set_waitq; | |
2338 | else | |
2339 | wq = &request->rq_reply_waitq; | |
2340 | ||
2341 | for (;;) { | |
2342 | /* Network access will complete in finite time but the HUGE | |
2343 | * timeout lets us CWARN for visibility of sluggish NALs */ | |
2344 | lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), | |
2345 | cfs_time_seconds(1), NULL, NULL); | |
2346 | rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request), | |
2347 | &lwi); | |
2348 | if (rc == 0) { | |
2349 | ptlrpc_rqphase_move(request, request->rq_next_phase); | |
2350 | RETURN(1); | |
2351 | } | |
2352 | ||
2353 | LASSERT(rc == -ETIMEDOUT); | |
2354 | DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout " | |
2355 | "rvcng=%d unlnk=%d", request->rq_receiving_reply, | |
2356 | request->rq_must_unlink); | |
2357 | } | |
2358 | RETURN(0); | |
2359 | } | |
2360 | EXPORT_SYMBOL(ptlrpc_unregister_reply); | |
2361 | ||
2362 | /** | |
2363 | * Iterates through the replay_list on the import and prunes | |
2364 | * all requests that have a transno smaller than last_committed for | |
2365 | * the import and don't have rq_replay set. | |
2366 | * Since requests are sorted in transno order, stops on meeting the | |
2367 | * first transno bigger than last_committed. | |
2368 | * Caller must hold imp->imp_lock. | |
2369 | */ | |
2370 | void ptlrpc_free_committed(struct obd_import *imp) | |
2371 | { | |
2372 | struct list_head *tmp, *saved; | |
2373 | struct ptlrpc_request *req; | |
2374 | struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ | |
2375 | |
2376 | LASSERT(imp != NULL); | |
2377 | ||
2378 | LASSERT(spin_is_locked(&imp->imp_lock)); | |
2379 | ||
2380 | ||
2381 | if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked && | |
2382 | imp->imp_generation == imp->imp_last_generation_checked) { | |
2383 | CDEBUG(D_INFO, "%s: skip recheck: last_committed "LPU64"\n", | |
2384 | imp->imp_obd->obd_name, imp->imp_peer_committed_transno); | |
2385 | return; |
2386 | } | |
2387 | CDEBUG(D_RPCTRACE, "%s: committing for last_committed "LPU64" gen %d\n", | |
2388 | imp->imp_obd->obd_name, imp->imp_peer_committed_transno, | |
2389 | imp->imp_generation); | |
2390 | imp->imp_last_transno_checked = imp->imp_peer_committed_transno; | |
2391 | imp->imp_last_generation_checked = imp->imp_generation; | |
2392 | ||
2393 | list_for_each_safe(tmp, saved, &imp->imp_replay_list) { | |
2394 | req = list_entry(tmp, struct ptlrpc_request, | |
2395 | rq_replay_list); | |
2396 | ||
2397 | /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ | |
2398 | LASSERT(req != last_req); | |
2399 | last_req = req; | |
2400 | ||
2401 | if (req->rq_transno == 0) { | |
2402 | DEBUG_REQ(D_EMERG, req, "zero transno during replay"); | |
2403 | LBUG(); | |
2404 | } | |
2405 | if (req->rq_import_generation < imp->imp_generation) { | |
2406 | DEBUG_REQ(D_RPCTRACE, req, "free request with old gen"); | |
2407 | GOTO(free_req, 0); | |
2408 | } | |
2409 | ||
2410 | if (req->rq_replay) { | |
2411 | DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)"); | |
2412 | continue; | |
2413 | } | |
2414 | ||
2415 | /* not yet committed */ | |
2416 | if (req->rq_transno > imp->imp_peer_committed_transno) { | |
2417 | DEBUG_REQ(D_RPCTRACE, req, "stopping search"); | |
2418 | break; | |
2419 | } | |
2420 | ||
2421 | DEBUG_REQ(D_INFO, req, "commit (last_committed "LPU64")", | |
2422 | imp->imp_peer_committed_transno); | |
2423 | free_req: | |
2424 | spin_lock(&req->rq_lock); | |
2425 | req->rq_replay = 0; | |
2426 | spin_unlock(&req->rq_lock); | |
2427 | if (req->rq_commit_cb != NULL) | |
2428 | req->rq_commit_cb(req); | |
2429 | list_del_init(&req->rq_replay_list); | |
2430 | __ptlrpc_req_finished(req, 1); | |
2431 | } | |
2432 | } |
2433 | ||
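/*
 * Editor's sketch (hypothetical callback, not in the original): the kind
 * of rq_commit_cb hook that ptlrpc_free_committed() invokes above once a
 * request's transno is known to be committed by the server.
 */
static void my_commit_cb(struct ptlrpc_request *req)
{
        /* a real callback would release whatever the caller kept pinned
         * until commit, e.g. cached pages or a local journal record */
        CDEBUG(D_HA, "transno "LPU64" committed\n", req->rq_transno);
}
/* installed before the request is sent: req->rq_commit_cb = my_commit_cb; */
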
2434 | void ptlrpc_cleanup_client(struct obd_import *imp) | |
2435 | { | |
2436 | } |
2437 | EXPORT_SYMBOL(ptlrpc_cleanup_client); | |
2438 | ||
2439 | /** | |
2440 | * Schedule previously sent request for resend. | |
2441 | * For bulk requests we assign new xid (to avoid problems with | |
2442 | * lost replies and therefore several transfers landing into same buffer | |
2443 | * from different sending attempts). | |
2444 | */ | |
2445 | void ptlrpc_resend_req(struct ptlrpc_request *req) | |
2446 | { | |
2447 | DEBUG_REQ(D_HA, req, "going to resend"); | |
2448 | lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 }); | |
2449 | req->rq_status = -EAGAIN; | |
2450 | ||
2451 | spin_lock(&req->rq_lock); | |
2452 | req->rq_resend = 1; | |
2453 | req->rq_net_err = 0; | |
2454 | req->rq_timedout = 0; | |
2455 | if (req->rq_bulk) { | |
2456 | __u64 old_xid = req->rq_xid; | |
2457 | ||
2458 | /* ensure previous bulk fails */ | |
2459 | req->rq_xid = ptlrpc_next_xid(); | |
2460 | CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n", | |
2461 | old_xid, req->rq_xid); | |
2462 | } | |
2463 | ptlrpc_client_wake_req(req); | |
2464 | spin_unlock(&req->rq_lock); | |
2465 | } | |
2466 | EXPORT_SYMBOL(ptlrpc_resend_req); | |
2467 | ||
2468 | /* XXX: this function and rq_status are currently unused */ | |
2469 | void ptlrpc_restart_req(struct ptlrpc_request *req) | |
2470 | { | |
2471 | DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request"); | |
2472 | req->rq_status = -ERESTARTSYS; | |
2473 | ||
2474 | spin_lock(&req->rq_lock); | |
2475 | req->rq_restart = 1; | |
2476 | req->rq_timedout = 0; | |
2477 | ptlrpc_client_wake_req(req); | |
2478 | spin_unlock(&req->rq_lock); | |
2479 | } | |
2480 | EXPORT_SYMBOL(ptlrpc_restart_req); | |
2481 | ||
2482 | /** | |
2483 | * Grab additional reference on a request \a req | |
2484 | */ | |
2485 | struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) | |
2486 | { | |
2487 | atomic_inc(&req->rq_refcount); |
2488 | RETURN(req); | |
2489 | } | |
2490 | EXPORT_SYMBOL(ptlrpc_request_addref); | |
2491 | ||
2492 | /** | |
2493 | * Add a request to import replay_list. | |
2494 | * Must be called under imp_lock | |
2495 | */ | |
2496 | void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, | |
2497 | struct obd_import *imp) | |
2498 | { | |
2499 | struct list_head *tmp; | |
2500 | ||
2501 | LASSERT(spin_is_locked(&imp->imp_lock)); | |
2502 | ||
2503 | if (req->rq_transno == 0) { | |
2504 | DEBUG_REQ(D_EMERG, req, "saving request with zero transno"); | |
2505 | LBUG(); | |
2506 | } | |
2507 | ||
2508 | /* clear this for new requests that were resent as well | |
2509 | as resent replayed requests. */ | |
2510 | lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); | |
2511 | ||
2512 | /* don't re-add requests that have been replayed */ | |
2513 | if (!list_empty(&req->rq_replay_list)) | |
2514 | return; | |
2515 | ||
2516 | lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); | |
2517 | ||
2518 | LASSERT(imp->imp_replayable); | |
2519 | /* Balanced in ptlrpc_free_committed, usually. */ | |
2520 | ptlrpc_request_addref(req); | |
2521 | list_for_each_prev(tmp, &imp->imp_replay_list) { | |
2522 | struct ptlrpc_request *iter = | |
2523 | list_entry(tmp, struct ptlrpc_request, | |
2524 | rq_replay_list); | |
2525 | ||
2526 | /* We may have duplicate transnos if we create and then | |
2527 | * open a file, or for closes retained to match creating | |
2528 | * opens, so use req->rq_xid as a secondary key. | |
2529 | * (See bugs 684, 685, and 428.) | |
2530 | * XXX no longer needed, but all opens need transnos! | |
2531 | */ | |
2532 | if (iter->rq_transno > req->rq_transno) | |
2533 | continue; | |
2534 | ||
2535 | if (iter->rq_transno == req->rq_transno) { | |
2536 | LASSERT(iter->rq_xid != req->rq_xid); | |
2537 | if (iter->rq_xid > req->rq_xid) | |
2538 | continue; | |
2539 | } | |
2540 | ||
2541 | list_add(&req->rq_replay_list, &iter->rq_replay_list); | |
2542 | return; | |
2543 | } | |
2544 | ||
2545 | list_add(&req->rq_replay_list, &imp->imp_replay_list); | |
2546 | } | |
2547 | EXPORT_SYMBOL(ptlrpc_retain_replayable_request); | |
2548 | ||
2549 | /** | |
2550 | * Send request and wait until it completes. | |
2551 | * Returns request processing status. | |
2552 | */ | |
2553 | int ptlrpc_queue_wait(struct ptlrpc_request *req) | |
2554 | { | |
2555 | struct ptlrpc_request_set *set; | |
2556 | int rc; | |
2557 | |
2558 | LASSERT(req->rq_set == NULL); | |
2559 | LASSERT(!req->rq_receiving_reply); | |
2560 | ||
2561 | set = ptlrpc_prep_set(); | |
2562 | if (set == NULL) { | |
2563 | CERROR("Unable to allocate ptlrpc set."); | |
2564 | RETURN(-ENOMEM); | |
2565 | } | |
2566 | ||
2567 | /* for distributed debugging */ | |
2568 | lustre_msg_set_status(req->rq_reqmsg, current_pid()); | |
2569 | ||
2570 | /* add a ref for the set (see comment in ptlrpc_set_add_req) */ | |
2571 | ptlrpc_request_addref(req); | |
2572 | ptlrpc_set_add_req(set, req); | |
2573 | rc = ptlrpc_set_wait(set); | |
2574 | ptlrpc_set_destroy(set); | |
2575 | ||
2576 | RETURN(rc); | |
2577 | } | |
2578 | EXPORT_SYMBOL(ptlrpc_queue_wait); | |
2579 | ||
2580 | struct ptlrpc_replay_async_args { | |
2581 | int praa_old_state; | |
2582 | int praa_old_status; | |
2583 | }; | |
2584 | ||
2585 | /** | |
2586 | * Callback used for replayed requests reply processing. | |
2587 | * In case of a successful reply, calls the registered request replay callback. | |
2588 | * In case of error, restarts the replay process. | |
2589 | */ | |
2590 | static int ptlrpc_replay_interpret(const struct lu_env *env, | |
2591 | struct ptlrpc_request *req, | |
2592 | void * data, int rc) | |
2593 | { | |
2594 | struct ptlrpc_replay_async_args *aa = data; | |
2595 | struct obd_import *imp = req->rq_import; | |
2596 | ||
2597 | atomic_dec(&imp->imp_replay_inflight); |
2598 | ||
2599 | if (!ptlrpc_client_replied(req)) { | |
2600 | CERROR("request replay timed out, restarting recovery\n"); | |
2601 | GOTO(out, rc = -ETIMEDOUT); | |
2602 | } | |
2603 | ||
2604 | if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR && | |
2605 | (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN || | |
2606 | lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) | |
2607 | GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg)); | |
2608 | ||
2609 | /** VBR: check version failure */ | |
2610 | if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) { | |
2611 | /** replay failed due to version mismatch */ | |
2612 | DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); | |
2613 | spin_lock(&imp->imp_lock); | |
2614 | imp->imp_vbr_failed = 1; | |
2615 | imp->imp_no_lock_replay = 1; | |
2616 | spin_unlock(&imp->imp_lock); | |
2617 | lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); | |
2618 | } else { | |
2619 | /** The transno had better not change over replay. */ | |
2620 | LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == | |
2621 | lustre_msg_get_transno(req->rq_repmsg) || | |
2622 | lustre_msg_get_transno(req->rq_repmsg) == 0, | |
2623 | LPX64"/"LPX64"\n", | |
2624 | lustre_msg_get_transno(req->rq_reqmsg), | |
2625 | lustre_msg_get_transno(req->rq_repmsg)); | |
2626 | } | |
2627 | ||
2628 | spin_lock(&imp->imp_lock); | |
2629 | /** if replaying by version, a gap occurred on the server - don't trust locks */ | |
2630 | if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY) | |
2631 | imp->imp_no_lock_replay = 1; | |
2632 | imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); | |
2633 | spin_unlock(&imp->imp_lock); | |
2634 | LASSERT(imp->imp_last_replay_transno); | |
2635 | ||
2636 | /* transaction number shouldn't be bigger than the latest replayed */ | |
2637 | if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { | |
2638 | DEBUG_REQ(D_ERROR, req, | |
2639 | "Reported transno "LPU64" is bigger than the " | |
2640 | "replayed one: "LPU64, req->rq_transno, | |
2641 | lustre_msg_get_transno(req->rq_reqmsg)); | |
2642 | GOTO(out, rc = -EINVAL); | |
2643 | } | |
2644 | ||
2645 | DEBUG_REQ(D_HA, req, "got rep"); | |
2646 | ||
2647 | /* let the callback do fixups, possibly including in the request */ | |
2648 | if (req->rq_replay_cb) | |
2649 | req->rq_replay_cb(req); | |
2650 | ||
2651 | if (ptlrpc_client_replied(req) && | |
2652 | lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { | |
2653 | DEBUG_REQ(D_ERROR, req, "status %d, old was %d", | |
2654 | lustre_msg_get_status(req->rq_repmsg), | |
2655 | aa->praa_old_status); | |
2656 | } else { | |
2657 | /* Put it back for re-replay. */ | |
2658 | lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); | |
2659 | } | |
2660 | ||
2661 | /* | |
2662 | * Errors during replay can set transno to 0, but | |
2663 | * imp_last_replay_transno shouldn't be set to 0 anyway | |
2664 | */ | |
2665 | if (req->rq_transno == 0) | |
2666 | CERROR("Transno is 0 during replay!\n"); | |
2667 | ||
2668 | /* continue with recovery */ | |
2669 | rc = ptlrpc_import_recovery_state_machine(imp); | |
2670 | out: | |
2671 | req->rq_send_state = aa->praa_old_state; | |
2672 | ||
2673 | if (rc != 0) | |
2674 | /* this replay failed, so restart recovery */ | |
2675 | ptlrpc_connect_import(imp); | |
2676 | ||
2677 | RETURN(rc); | |
2678 | } | |
2679 | ||
2680 | /** | |
2681 | * Prepares and queues request for replay. | |
2682 | * Adds it to ptlrpcd queue for actual sending. | |
2683 | * Returns 0 on success. | |
2684 | */ | |
2685 | int ptlrpc_replay_req(struct ptlrpc_request *req) | |
2686 | { | |
2687 | struct ptlrpc_replay_async_args *aa; | |
2688 | |
2689 | LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); | |
2690 | ||
2691 | LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); | |
2692 | aa = ptlrpc_req_async_args(req); | |
2693 | memset(aa, 0, sizeof(*aa)); | |
2694 | ||
2695 | /* Prepare request to be resent with ptlrpcd */ | |
2696 | aa->praa_old_state = req->rq_send_state; | |
2697 | req->rq_send_state = LUSTRE_IMP_REPLAY; | |
2698 | req->rq_phase = RQ_PHASE_NEW; | |
2699 | req->rq_next_phase = RQ_PHASE_UNDEFINED; | |
2700 | if (req->rq_repmsg) | |
2701 | aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); | |
2702 | req->rq_status = 0; | |
2703 | req->rq_interpret_reply = ptlrpc_replay_interpret; | |
2704 | /* Readjust the timeout for current conditions */ | |
2705 | ptlrpc_at_set_req_timeout(req); | |
2706 | ||
2707 | /* Tell server the net_latency, so the server can calculate how long | |
2708 | * it should wait for next replay */ | |
2709 | lustre_msg_set_service_time(req->rq_reqmsg, | |
2710 | ptlrpc_at_get_net_latency(req)); | |
2711 | DEBUG_REQ(D_HA, req, "REPLAY"); | |
2712 | ||
2713 | atomic_inc(&req->rq_import->imp_replay_inflight); | |
2714 | ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ | |
2715 | ||
2716 | ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); | |
2717 | RETURN(0); | |
2718 | } | |
2719 | EXPORT_SYMBOL(ptlrpc_replay_req); | |
2720 | ||
2721 | /** | |
2722 | * Aborts all in-flight requests on import \a imp sending and delayed lists | |
2723 | */ | |
2724 | void ptlrpc_abort_inflight(struct obd_import *imp) | |
2725 | { | |
2726 | struct list_head *tmp, *n; | |
2727 | |
2728 | /* Make sure that no new requests get processed for this import. | |
2729 | * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing | |
2730 | * this flag and then putting requests on sending_list or delayed_list. | |
2731 | */ | |
2732 | spin_lock(&imp->imp_lock); | |
2733 | ||
2734 | /* XXX locking? Maybe we should remove each request with the list | |
2735 | * locked? Also, how do we know if the requests on the list are | |
2736 | * being freed at this time? | |
2737 | */ | |
2738 | list_for_each_safe(tmp, n, &imp->imp_sending_list) { | |
2739 | struct ptlrpc_request *req = | |
2740 | list_entry(tmp, struct ptlrpc_request, rq_list); | |
2741 | ||
2742 | DEBUG_REQ(D_RPCTRACE, req, "inflight"); | |
2743 | ||
2744 | spin_lock(&req->rq_lock); | |
2745 | if (req->rq_import_generation < imp->imp_generation) { | |
2746 | req->rq_err = 1; | |
2747 | req->rq_status = -EIO; | |
2748 | ptlrpc_client_wake_req(req); | |
2749 | } | |
2750 | spin_unlock(&req->rq_lock); | |
2751 | } | |
2752 | ||
2753 | list_for_each_safe(tmp, n, &imp->imp_delayed_list) { | |
2754 | struct ptlrpc_request *req = | |
2755 | list_entry(tmp, struct ptlrpc_request, rq_list); | |
2756 | ||
2757 | DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req"); | |
2758 | ||
2759 | spin_lock(&req->rq_lock); | |
2760 | if (req->rq_import_generation < imp->imp_generation) { | |
2761 | req->rq_err = 1; | |
2762 | req->rq_status = -EIO; | |
2763 | ptlrpc_client_wake_req(req); | |
2764 | } | |
2765 | spin_unlock(&req->rq_lock); | |
2766 | } | |
2767 | ||
2768 | /* Last chance to free reqs left on the replay list, but we | |
2769 | * will still leak reqs that haven't committed. */ | |
2770 | if (imp->imp_replayable) | |
2771 | ptlrpc_free_committed(imp); | |
2772 | ||
2773 | spin_unlock(&imp->imp_lock); | |
2774 | } |
2775 | EXPORT_SYMBOL(ptlrpc_abort_inflight); | |
2776 | ||
2777 | /** | |
2778 | * Abort all uncompleted requests in request set \a set | |
2779 | */ | |
2780 | void ptlrpc_abort_set(struct ptlrpc_request_set *set) | |
2781 | { | |
2782 | struct list_head *tmp, *pos; | |
2783 | ||
2784 | LASSERT(set != NULL); | |
2785 | ||
2786 | list_for_each_safe(pos, tmp, &set->set_requests) { | |
2787 | struct ptlrpc_request *req = | |
2788 | list_entry(pos, struct ptlrpc_request, | |
2789 | rq_set_chain); | |
2790 | ||
2791 | spin_lock(&req->rq_lock); | |
2792 | if (req->rq_phase != RQ_PHASE_RPC) { | |
2793 | spin_unlock(&req->rq_lock); | |
2794 | continue; | |
2795 | } | |
2796 | ||
2797 | req->rq_err = 1; | |
2798 | req->rq_status = -EINTR; | |
2799 | ptlrpc_client_wake_req(req); | |
2800 | spin_unlock(&req->rq_lock); | |
2801 | } | |
2802 | } | |
2803 | ||
2804 | static __u64 ptlrpc_last_xid; | |
2805 | static spinlock_t ptlrpc_last_xid_lock; | |
2806 | ||
2807 | /** | |
2808 | * Initialize the XID for the node. This is common among all requests on | |
2809 | * this node, and only requires the property that it is monotonically | |
2810 | * increasing. It does not need to be sequential. Since this is also used | |
2811 | * as the RDMA match bits, it is important that a single client NOT have | |
2812 | * the same match bits for two different in-flight requests, hence we do | |
2813 | * NOT want to have an XID per target or similar. | |
2814 | * | |
2815 | * To avoid an unlikely collision between match bits after a client reboot | |
2816 | * (which would deliver old data into the wrong RDMA buffer) initialize | |
2817 | * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s. | |
2818 | * If the time is clearly incorrect, we instead use a 62-bit random number. | |
2819 | * In the worst case the random number will overflow 1M RPCs per second in | |
2820 | * 9133 years, or permutations thereof. | |
2821 | */ | |
2822 | #define YEAR_2004 (1ULL << 30) | |
2823 | void ptlrpc_init_xid(void) | |
2824 | { | |
2825 | time_t now = cfs_time_current_sec(); | |
2826 | ||
2827 | spin_lock_init(&ptlrpc_last_xid_lock); | |
2828 | if (now < YEAR_2004) { | |
2829 | cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid)); | |
2830 | ptlrpc_last_xid >>= 2; | |
2831 | ptlrpc_last_xid |= (1ULL << 61); | |
2832 | } else { | |
2833 | ptlrpc_last_xid = (__u64)now << 20; | |
2834 | } | |
2835 | ||
2836 | /* Need to always be aligned to a power-of-two for multi-bulk BRW */ | |
2837 | CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0); | |
2838 | ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK; | |
2839 | } | |
2840 | ||
2841 | /** | |
2842 | * Increase xid and returns resulting new value to the caller. | |
2843 | * | |
2844 | * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting | |
2845 | * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC | |
2846 | * itself uses the last bulk xid needed, so the server can determine | |
2847 | * the number of bulk transfers from the RPC XID and a bitmask. The starting | |
2848 | * xid must align to a power-of-two value. | |
2849 | * | |
2850 | * This is assumed to be true due to the initial ptlrpc_last_xid | |
2851 | * value also being initialized to a power-of-two value. LU-1431 | |
2852 | */ | |
2853 | __u64 ptlrpc_next_xid(void) | |
2854 | { | |
2855 | __u64 next; | |
2856 | ||
2857 | spin_lock(&ptlrpc_last_xid_lock); | |
2858 | next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; | |
2859 | ptlrpc_last_xid = next; | |
2860 | spin_unlock(&ptlrpc_last_xid_lock); | |
2861 | ||
2862 | return next; | |
2863 | } | |
2864 | EXPORT_SYMBOL(ptlrpc_next_xid); | |
2865 | ||
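/*
 * Editor's sketch of the match-bit arithmetic described above, assuming
 * (as the CLASSERT in ptlrpc_init_xid() enforces) that
 * PTLRPC_BULK_OPS_COUNT is a power of two.  A multi-bulk BRW occupies
 * xids [base, base + PTLRPC_BULK_OPS_COUNT - 1] and carries the last
 * xid it actually used in the RPC, so the aligned base is recovered by
 * masking.
 */
static inline __u64 my_first_bulk_xid(__u64 rpc_xid)
{
        return rpc_xid & PTLRPC_BULK_OPS_MASK;  /* start of the xid run */
}
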
2866 | /** | |
2867 | * Get a glimpse at what next xid value might have been. | |
2868 | * Returns possible next xid. | |
2869 | */ | |
2870 | __u64 ptlrpc_sample_next_xid(void) | |
2871 | { | |
2872 | #if BITS_PER_LONG == 32 | |
2873 | /* need to avoid possible word tearing on 32-bit systems */ | |
2874 | __u64 next; | |
2875 | ||
2876 | spin_lock(&ptlrpc_last_xid_lock); | |
2877 | next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; | |
2878 | spin_unlock(&ptlrpc_last_xid_lock); | |
2879 | ||
2880 | return next; | |
2881 | #else | |
2882 | /* No need to lock, since returned value is racy anyways */ | |
2883 | return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; | |
2884 | #endif | |
2885 | } | |
2886 | EXPORT_SYMBOL(ptlrpc_sample_next_xid); | |
2887 | ||
2888 | /** | |
2889 | * Functions for operating ptlrpc workers. | |
2890 | * | |
2891 | * A ptlrpc work is a function which will be running inside ptlrpc context. | |
2892 | * The callback shouldn't sleep otherwise it will block that ptlrpcd thread. | |
2893 | * | |
2894 | * 1. after a work is created, it can be used many times, that is: | |
2895 | * handler = ptlrpcd_alloc_work(); | |
2896 | * ptlrpcd_queue_work(); | |
2897 | * | |
2898 | * queue it again when necessary: | |
2899 | * ptlrpcd_queue_work(); | |
2900 | * ptlrpcd_destroy_work(); | |
2901 | * 2. ptlrpcd_queue_work() can be called by multiple processes concurrently, | |
2902 | * but the work will only be queued once at any time. Also, as its name | |
2903 | * implies, it may be delayed before a ptlrpcd thread actually runs it. | |
2904 | */ | |
2905 | struct ptlrpc_work_async_args { | |
2906 | __u64 magic; | |
2907 | int (*cb)(const struct lu_env *, void *); | |
2908 | void *cbdata; | |
2909 | }; | |
2910 | ||
2911 | #define PTLRPC_WORK_MAGIC 0x6655436b676f4f44ULL /* magic code */ | |
2912 | ||
2913 | static int work_interpreter(const struct lu_env *env, | |
2914 | struct ptlrpc_request *req, void *data, int rc) | |
2915 | { | |
2916 | struct ptlrpc_work_async_args *arg = data; | |
2917 | ||
2918 | LASSERT(arg->magic == PTLRPC_WORK_MAGIC); | |
2919 | LASSERT(arg->cb != NULL); | |
2920 | ||
2921 | return arg->cb(env, arg->cbdata); | |
2922 | } | |
2923 | ||
2924 | /** | |
2925 | * Create a work for ptlrpc. | |
2926 | */ | |
2927 | void *ptlrpcd_alloc_work(struct obd_import *imp, | |
2928 | int (*cb)(const struct lu_env *, void *), void *cbdata) | |
2929 | { | |
2930 | struct ptlrpc_request *req = NULL; | |
2931 | struct ptlrpc_work_async_args *args; | |
2932 | |
2933 | might_sleep(); | |
2934 | ||
2935 | if (cb == NULL) | |
2936 | RETURN(ERR_PTR(-EINVAL)); | |
2937 | ||
2938 | /* copy some code from deprecated fakereq. */ | |
2939 | OBD_ALLOC_PTR(req); | |
2940 | if (req == NULL) { | |
2941 | CERROR("ptlrpc: run out of memory!\n"); | |
2942 | RETURN(ERR_PTR(-ENOMEM)); | |
2943 | } | |
2944 | ||
2945 | req->rq_send_state = LUSTRE_IMP_FULL; | |
2946 | req->rq_type = PTL_RPC_MSG_REQUEST; | |
2947 | req->rq_import = class_import_get(imp); | |
2948 | req->rq_export = NULL; | |
2949 | req->rq_interpret_reply = work_interpreter; | |
2950 | /* don't want reply */ | |
2951 | req->rq_receiving_reply = 0; | |
2952 | req->rq_must_unlink = 0; | |
2953 | req->rq_no_delay = req->rq_no_resend = 1; | |
2954 | ||
2955 | spin_lock_init(&req->rq_lock); | |
2956 | INIT_LIST_HEAD(&req->rq_list); | |
2957 | INIT_LIST_HEAD(&req->rq_replay_list); | |
2958 | INIT_LIST_HEAD(&req->rq_set_chain); | |
2959 | INIT_LIST_HEAD(&req->rq_history_list); | |
2960 | INIT_LIST_HEAD(&req->rq_exp_list); | |
2961 | init_waitqueue_head(&req->rq_reply_waitq); | |
2962 | init_waitqueue_head(&req->rq_set_waitq); | |
2963 | atomic_set(&req->rq_refcount, 1); | |
2964 | ||
2965 | CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args)); | |
2966 | args = ptlrpc_req_async_args(req); | |
2967 | args->magic = PTLRPC_WORK_MAGIC; | |
2968 | args->cb = cb; | |
2969 | args->cbdata = cbdata; | |
2970 | ||
2971 | RETURN(req); | |
2972 | } | |
2973 | EXPORT_SYMBOL(ptlrpcd_alloc_work); | |
2974 | ||
2975 | void ptlrpcd_destroy_work(void *handler) | |
2976 | { | |
2977 | struct ptlrpc_request *req = handler; | |
2978 | ||
2979 | if (req) | |
2980 | ptlrpc_req_finished(req); | |
2981 | } | |
2982 | EXPORT_SYMBOL(ptlrpcd_destroy_work); | |
2983 | ||
2984 | int ptlrpcd_queue_work(void *handler) | |
2985 | { | |
2986 | struct ptlrpc_request *req = handler; | |
2987 | ||
2988 | /* | |
2989 | * Check if the req is already being queued. | |
2990 | * | |
2991 | * Here comes a trick: ptlrpc lacks a way of reliably checking if a | |
2992 | * req is being processed, so I have to use the refcount of the req | |
2993 | * for this purpose. This is okay because the caller should use this | |
2994 | * req as opaque data. - Jinshan | |
2995 | */ | |
2996 | LASSERT(atomic_read(&req->rq_refcount) > 0); | |
2997 | if (atomic_read(&req->rq_refcount) > 1) | |
2998 | return -EBUSY; | |
2999 | ||
3000 | if (atomic_inc_return(&req->rq_refcount) > 2) { /* race */ | |
3001 | atomic_dec(&req->rq_refcount); | |
3002 | return -EBUSY; | |
3003 | } | |
3004 | ||
3005 | /* re-initialize the req */ | |
3006 | req->rq_timeout = obd_timeout; | |
3007 | req->rq_sent = cfs_time_current_sec(); | |
3008 | req->rq_deadline = req->rq_sent + req->rq_timeout; | |
3009 | req->rq_reply_deadline = req->rq_deadline; | |
3010 | req->rq_phase = RQ_PHASE_INTERPRET; | |
3011 | req->rq_next_phase = RQ_PHASE_COMPLETE; | |
3012 | req->rq_xid = ptlrpc_next_xid(); | |
3013 | req->rq_import_generation = req->rq_import->imp_generation; | |
3014 | ||
3015 | ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); | |
3016 | return 0; | |
3017 | } | |
3018 | EXPORT_SYMBOL(ptlrpcd_queue_work); |
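
/*
 * Editor's sketch (assumed usage, not from the original file) of the
 * work API documented above.  my_flush_cb() is a hypothetical callback;
 * note that it runs in ptlrpcd context and therefore must not sleep.
 */
static int my_flush_cb(const struct lu_env *env, void *data)
{
        CDEBUG(D_RPCTRACE, "work fired, data %p\n", data);
        return 0;
}

/*
 * Typical life cycle:
 *
 *      void *work = ptlrpcd_alloc_work(imp, my_flush_cb, my_data);
 *      if (!IS_ERR(work)) {
 *              ptlrpcd_queue_work(work);       (repeat as needed;
 *                                               -EBUSY means already queued)
 *              ptlrpcd_destroy_work(work);
 *      }
 */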