Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
30 | * Copyright (c) 2011, 2012, Intel Corporation. | |
31 | */ | |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | */ | |
36 | ||
37 | /** Implementation of client-side PortalRPC interfaces */ | |
38 | ||
39 | #define DEBUG_SUBSYSTEM S_RPC | |
40 | ||
e27db149 GKH |
41 | #include "../include/obd_support.h" |
42 | #include "../include/obd_class.h" | |
43 | #include "../include/lustre_lib.h" | |
44 | #include "../include/lustre_ha.h" | |
45 | #include "../include/lustre_import.h" | |
46 | #include "../include/lustre_req_layout.h" | |
d7e09d03 PT |
47 | |
48 | #include "ptlrpc_internal.h" | |
49 | ||
50 | static int ptlrpc_send_new_req(struct ptlrpc_request *req); | |
82a373ae | 51 | static int ptlrpcd_check_work(struct ptlrpc_request *req); |
d7e09d03 PT |
52 | |
53 | /** | |
54 | * Initialize passed in client structure \a cl. | |
55 | */ | |
56 | void ptlrpc_init_client(int req_portal, int rep_portal, char *name, | |
57 | struct ptlrpc_client *cl) | |
58 | { | |
59 | cl->cli_request_portal = req_portal; | |
60 | cl->cli_reply_portal = rep_portal; | |
61 | cl->cli_name = name; | |
62 | } | |
63 | EXPORT_SYMBOL(ptlrpc_init_client); | |
64 | ||
65 | /** | |
930cef9a | 66 | * Return PortalRPC connection for remote uud \a uuid |
d7e09d03 PT |
67 | */ |
68 | struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid) | |
69 | { | |
70 | struct ptlrpc_connection *c; | |
71 | lnet_nid_t self; | |
72 | lnet_process_id_t peer; | |
73 | int err; | |
74 | ||
75 | /* ptlrpc_uuid_to_peer() initializes its 2nd parameter | |
76 | * before accessing its values. */ | |
77 | /* coverity[uninit_use_in_call] */ | |
78 | err = ptlrpc_uuid_to_peer(uuid, &peer, &self); | |
79 | if (err != 0) { | |
80 | CNETERR("cannot find peer %s!\n", uuid->uuid); | |
81 | return NULL; | |
82 | } | |
83 | ||
84 | c = ptlrpc_connection_get(peer, self, uuid); | |
85 | if (c) { | |
86 | memcpy(c->c_remote_uuid.uuid, | |
87 | uuid->uuid, sizeof(c->c_remote_uuid.uuid)); | |
88 | } | |
89 | ||
90 | CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c); | |
91 | ||
92 | return c; | |
93 | } | |
94 | EXPORT_SYMBOL(ptlrpc_uuid_to_connection); | |
95 | ||
96 | /** | |
97 | * Allocate and initialize new bulk descriptor on the sender. | |
98 | * Returns pointer to the descriptor or NULL on error. | |
99 | */ | |
100 | struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, | |
101 | unsigned type, unsigned portal) | |
102 | { | |
103 | struct ptlrpc_bulk_desc *desc; | |
104 | int i; | |
105 | ||
106 | OBD_ALLOC(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[npages])); | |
107 | if (!desc) | |
108 | return NULL; | |
109 | ||
110 | spin_lock_init(&desc->bd_lock); | |
111 | init_waitqueue_head(&desc->bd_waitq); | |
112 | desc->bd_max_iov = npages; | |
113 | desc->bd_iov_count = 0; | |
114 | desc->bd_portal = portal; | |
115 | desc->bd_type = type; | |
116 | desc->bd_md_count = 0; | |
117 | LASSERT(max_brw > 0); | |
118 | desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); | |
119 | /* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this | |
120 | * node. Negotiated ocd_brw_size will always be <= this number. */ | |
121 | for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++) | |
122 | LNetInvalidateHandle(&desc->bd_mds[i]); | |
123 | ||
124 | return desc; | |
125 | } | |
126 | ||
127 | /** | |
128 | * Prepare bulk descriptor for specified outgoing request \a req that | |
129 | * can fit \a npages * pages. \a type is bulk type. \a portal is where | |
130 | * the bulk to be sent. Used on client-side. | |
930cef9a | 131 | * Returns pointer to newly allocated initialized bulk descriptor or NULL on |
d7e09d03 PT |
132 | * error. |
133 | */ | |
134 | struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, | |
135 | unsigned npages, unsigned max_brw, | |
136 | unsigned type, unsigned portal) | |
137 | { | |
138 | struct obd_import *imp = req->rq_import; | |
139 | struct ptlrpc_bulk_desc *desc; | |
140 | ||
d7e09d03 PT |
141 | LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE); |
142 | desc = ptlrpc_new_bulk(npages, max_brw, type, portal); | |
143 | if (desc == NULL) | |
0a3bdb00 | 144 | return NULL; |
d7e09d03 PT |
145 | |
146 | desc->bd_import_generation = req->rq_import_generation; | |
147 | desc->bd_import = class_import_get(imp); | |
148 | desc->bd_req = req; | |
149 | ||
150 | desc->bd_cbid.cbid_fn = client_bulk_callback; | |
151 | desc->bd_cbid.cbid_arg = desc; | |
152 | ||
153 | /* This makes req own desc, and free it when she frees herself */ | |
154 | req->rq_bulk = desc; | |
155 | ||
156 | return desc; | |
157 | } | |
158 | EXPORT_SYMBOL(ptlrpc_prep_bulk_imp); | |
159 | ||
160 | /** | |
161 | * Add a page \a page to the bulk descriptor \a desc. | |
162 | * Data to transfer in the page starts at offset \a pageoffset and | |
163 | * amount of data to transfer from the page is \a len | |
164 | */ | |
165 | void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, | |
166 | struct page *page, int pageoffset, int len, int pin) | |
167 | { | |
168 | LASSERT(desc->bd_iov_count < desc->bd_max_iov); | |
169 | LASSERT(page != NULL); | |
170 | LASSERT(pageoffset >= 0); | |
171 | LASSERT(len > 0); | |
172 | LASSERT(pageoffset + len <= PAGE_CACHE_SIZE); | |
173 | ||
174 | desc->bd_nob += len; | |
175 | ||
176 | if (pin) | |
177 | page_cache_get(page); | |
178 | ||
179 | ptlrpc_add_bulk_page(desc, page, pageoffset, len); | |
180 | } | |
181 | EXPORT_SYMBOL(__ptlrpc_prep_bulk_page); | |
182 | ||
183 | /** | |
184 | * Uninitialize and free bulk descriptor \a desc. | |
185 | * Works on bulk descriptors both from server and client side. | |
186 | */ | |
187 | void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin) | |
188 | { | |
189 | int i; | |
d7e09d03 PT |
190 | |
191 | LASSERT(desc != NULL); | |
192 | LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ | |
193 | LASSERT(desc->bd_md_count == 0); /* network hands off */ | |
194 | LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); | |
195 | ||
196 | sptlrpc_enc_pool_put_pages(desc); | |
197 | ||
198 | if (desc->bd_export) | |
199 | class_export_put(desc->bd_export); | |
200 | else | |
201 | class_import_put(desc->bd_import); | |
202 | ||
203 | if (unpin) { | |
7b8633de | 204 | for (i = 0; i < desc->bd_iov_count; i++) |
d7e09d03 PT |
205 | page_cache_release(desc->bd_iov[i].kiov_page); |
206 | } | |
207 | ||
208 | OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, | |
209 | bd_iov[desc->bd_max_iov])); | |
d7e09d03 PT |
210 | } |
211 | EXPORT_SYMBOL(__ptlrpc_free_bulk); | |
212 | ||
213 | /** | |
214 | * Set server timelimit for this req, i.e. how long are we willing to wait | |
215 | * for reply before timing out this request. | |
216 | */ | |
217 | void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) | |
218 | { | |
219 | __u32 serv_est; | |
220 | int idx; | |
221 | struct imp_at *at; | |
222 | ||
223 | LASSERT(req->rq_import); | |
224 | ||
225 | if (AT_OFF) { | |
226 | /* non-AT settings */ | |
227 | /** | |
228 | * \a imp_server_timeout means this is reverse import and | |
229 | * we send (currently only) ASTs to the client and cannot afford | |
230 | * to wait too long for the reply, otherwise the other client | |
231 | * (because of which we are sending this request) would | |
232 | * timeout waiting for us | |
233 | */ | |
234 | req->rq_timeout = req->rq_import->imp_server_timeout ? | |
235 | obd_timeout / 2 : obd_timeout; | |
236 | } else { | |
237 | at = &req->rq_import->imp_at; | |
238 | idx = import_at_get_index(req->rq_import, | |
239 | req->rq_request_portal); | |
240 | serv_est = at_get(&at->iat_service_estimate[idx]); | |
241 | req->rq_timeout = at_est2timeout(serv_est); | |
242 | } | |
243 | /* We could get even fancier here, using history to predict increased | |
244 | loading... */ | |
245 | ||
246 | /* Let the server know what this RPC timeout is by putting it in the | |
247 | reqmsg*/ | |
248 | lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); | |
249 | } | |
250 | EXPORT_SYMBOL(ptlrpc_at_set_req_timeout); | |
251 | ||
252 | /* Adjust max service estimate based on server value */ | |
253 | static void ptlrpc_at_adj_service(struct ptlrpc_request *req, | |
254 | unsigned int serv_est) | |
255 | { | |
256 | int idx; | |
257 | unsigned int oldse; | |
258 | struct imp_at *at; | |
259 | ||
260 | LASSERT(req->rq_import); | |
261 | at = &req->rq_import->imp_at; | |
262 | ||
263 | idx = import_at_get_index(req->rq_import, req->rq_request_portal); | |
264 | /* max service estimates are tracked on the server side, | |
265 | so just keep minimal history here */ | |
266 | oldse = at_measured(&at->iat_service_estimate[idx], serv_est); | |
267 | if (oldse != 0) | |
268 | CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d " | |
269 | "has changed from %d to %d\n", | |
0ae015be | 270 | req->rq_import->imp_obd->obd_name, req->rq_request_portal, |
d7e09d03 PT |
271 | oldse, at_get(&at->iat_service_estimate[idx])); |
272 | } | |
273 | ||
274 | /* Expected network latency per remote node (secs) */ | |
275 | int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) | |
276 | { | |
277 | return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency); | |
278 | } | |
279 | ||
280 | /* Adjust expected network latency */ | |
281 | static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, | |
282 | unsigned int service_time) | |
283 | { | |
284 | unsigned int nl, oldnl; | |
285 | struct imp_at *at; | |
7264b8a5 | 286 | time_t now = get_seconds(); |
d7e09d03 PT |
287 | |
288 | LASSERT(req->rq_import); | |
289 | at = &req->rq_import->imp_at; | |
290 | ||
291 | /* Network latency is total time less server processing time */ | |
b533ff4b | 292 | nl = max_t(int, now - req->rq_sent - service_time, 0) + 1/*st rounding*/; |
d7e09d03 PT |
293 | if (service_time > now - req->rq_sent + 3 /* bz16408 */) |
294 | CWARN("Reported service time %u > total measured time " | |
295 | CFS_DURATION_T"\n", service_time, | |
296 | cfs_time_sub(now, req->rq_sent)); | |
297 | ||
298 | oldnl = at_measured(&at->iat_net_latency, nl); | |
299 | if (oldnl != 0) | |
300 | CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) " | |
301 | "has changed from %d to %d\n", | |
302 | req->rq_import->imp_obd->obd_name, | |
303 | obd_uuid2str( | |
304 | &req->rq_import->imp_connection->c_remote_uuid), | |
305 | oldnl, at_get(&at->iat_net_latency)); | |
306 | } | |
307 | ||
308 | static int unpack_reply(struct ptlrpc_request *req) | |
309 | { | |
310 | int rc; | |
311 | ||
312 | if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { | |
313 | rc = ptlrpc_unpack_rep_msg(req, req->rq_replen); | |
314 | if (rc) { | |
315 | DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); | |
fbe7c6c7 | 316 | return -EPROTO; |
d7e09d03 PT |
317 | } |
318 | } | |
319 | ||
320 | rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); | |
321 | if (rc) { | |
322 | DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); | |
fbe7c6c7 | 323 | return -EPROTO; |
d7e09d03 PT |
324 | } |
325 | return 0; | |
326 | } | |
327 | ||
328 | /** | |
329 | * Handle an early reply message, called with the rq_lock held. | |
330 | * If anything goes wrong just ignore it - same as if it never happened | |
331 | */ | |
332 | static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) | |
333 | { | |
334 | struct ptlrpc_request *early_req; | |
335 | time_t olddl; | |
336 | int rc; | |
d7e09d03 PT |
337 | |
338 | req->rq_early = 0; | |
339 | spin_unlock(&req->rq_lock); | |
340 | ||
341 | rc = sptlrpc_cli_unwrap_early_reply(req, &early_req); | |
342 | if (rc) { | |
343 | spin_lock(&req->rq_lock); | |
0a3bdb00 | 344 | return rc; |
d7e09d03 PT |
345 | } |
346 | ||
347 | rc = unpack_reply(early_req); | |
348 | if (rc == 0) { | |
349 | /* Expecting to increase the service time estimate here */ | |
350 | ptlrpc_at_adj_service(req, | |
351 | lustre_msg_get_timeout(early_req->rq_repmsg)); | |
352 | ptlrpc_at_adj_net_latency(req, | |
353 | lustre_msg_get_service_time(early_req->rq_repmsg)); | |
354 | } | |
355 | ||
356 | sptlrpc_cli_finish_early_reply(early_req); | |
357 | ||
358 | if (rc != 0) { | |
359 | spin_lock(&req->rq_lock); | |
0a3bdb00 | 360 | return rc; |
d7e09d03 PT |
361 | } |
362 | ||
363 | /* Adjust the local timeout for this req */ | |
364 | ptlrpc_at_set_req_timeout(req); | |
365 | ||
366 | spin_lock(&req->rq_lock); | |
367 | olddl = req->rq_deadline; | |
368 | /* server assumes it now has rq_timeout from when it sent the | |
369 | * early reply, so client should give it at least that long. */ | |
7264b8a5 | 370 | req->rq_deadline = get_seconds() + req->rq_timeout + |
d7e09d03 PT |
371 | ptlrpc_at_get_net_latency(req); |
372 | ||
373 | DEBUG_REQ(D_ADAPTTO, req, | |
374 | "Early reply #%d, new deadline in "CFS_DURATION_T"s " | |
375 | "("CFS_DURATION_T"s)", req->rq_early_count, | |
7264b8a5 | 376 | cfs_time_sub(req->rq_deadline, get_seconds()), |
d7e09d03 PT |
377 | cfs_time_sub(req->rq_deadline, olddl)); |
378 | ||
0a3bdb00 | 379 | return rc; |
d7e09d03 PT |
380 | } |
381 | ||
35b2e1b7 AS |
382 | struct kmem_cache *request_cache; |
383 | ||
384 | int ptlrpc_request_cache_init(void) | |
385 | { | |
386 | request_cache = kmem_cache_create("ptlrpc_cache", | |
387 | sizeof(struct ptlrpc_request), | |
388 | 0, SLAB_HWCACHE_ALIGN, NULL); | |
389 | return request_cache == NULL ? -ENOMEM : 0; | |
390 | } | |
391 | ||
392 | void ptlrpc_request_cache_fini(void) | |
393 | { | |
394 | kmem_cache_destroy(request_cache); | |
395 | } | |
396 | ||
f1c571dc | 397 | struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags) |
35b2e1b7 AS |
398 | { |
399 | struct ptlrpc_request *req; | |
400 | ||
401 | OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags); | |
402 | return req; | |
403 | } | |
404 | ||
405 | void ptlrpc_request_cache_free(struct ptlrpc_request *req) | |
406 | { | |
407 | OBD_SLAB_FREE_PTR(req, request_cache); | |
408 | } | |
409 | ||
d7e09d03 PT |
410 | /** |
411 | * Wind down request pool \a pool. | |
412 | * Frees all requests from the pool too | |
413 | */ | |
414 | void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) | |
415 | { | |
416 | struct list_head *l, *tmp; | |
417 | struct ptlrpc_request *req; | |
418 | ||
419 | LASSERT(pool != NULL); | |
420 | ||
421 | spin_lock(&pool->prp_lock); | |
422 | list_for_each_safe(l, tmp, &pool->prp_req_list) { | |
423 | req = list_entry(l, struct ptlrpc_request, rq_list); | |
424 | list_del(&req->rq_list); | |
425 | LASSERT(req->rq_reqbuf); | |
426 | LASSERT(req->rq_reqbuf_len == pool->prp_rq_size); | |
427 | OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size); | |
35b2e1b7 | 428 | ptlrpc_request_cache_free(req); |
d7e09d03 PT |
429 | } |
430 | spin_unlock(&pool->prp_lock); | |
431 | OBD_FREE(pool, sizeof(*pool)); | |
432 | } | |
433 | EXPORT_SYMBOL(ptlrpc_free_rq_pool); | |
434 | ||
435 | /** | |
436 | * Allocates, initializes and adds \a num_rq requests to the pool \a pool | |
437 | */ | |
438 | void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) | |
439 | { | |
440 | int i; | |
441 | int size = 1; | |
442 | ||
443 | while (size < pool->prp_rq_size) | |
444 | size <<= 1; | |
445 | ||
446 | LASSERTF(list_empty(&pool->prp_req_list) || | |
447 | size == pool->prp_rq_size, | |
448 | "Trying to change pool size with nonempty pool " | |
449 | "from %d to %d bytes\n", pool->prp_rq_size, size); | |
450 | ||
451 | spin_lock(&pool->prp_lock); | |
452 | pool->prp_rq_size = size; | |
453 | for (i = 0; i < num_rq; i++) { | |
454 | struct ptlrpc_request *req; | |
455 | struct lustre_msg *msg; | |
456 | ||
457 | spin_unlock(&pool->prp_lock); | |
0be19afa | 458 | req = ptlrpc_request_cache_alloc(GFP_NOFS); |
d7e09d03 PT |
459 | if (!req) |
460 | return; | |
461 | OBD_ALLOC_LARGE(msg, size); | |
462 | if (!msg) { | |
35b2e1b7 | 463 | ptlrpc_request_cache_free(req); |
d7e09d03 PT |
464 | return; |
465 | } | |
466 | req->rq_reqbuf = msg; | |
467 | req->rq_reqbuf_len = size; | |
468 | req->rq_pool = pool; | |
469 | spin_lock(&pool->prp_lock); | |
470 | list_add_tail(&req->rq_list, &pool->prp_req_list); | |
471 | } | |
472 | spin_unlock(&pool->prp_lock); | |
473 | return; | |
474 | } | |
475 | EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); | |
476 | ||
477 | /** | |
478 | * Create and initialize new request pool with given attributes: | |
479 | * \a num_rq - initial number of requests to create for the pool | |
480 | * \a msgsize - maximum message size possible for requests in thid pool | |
481 | * \a populate_pool - function to be called when more requests need to be added | |
482 | * to the pool | |
483 | * Returns pointer to newly created pool or NULL on error. | |
484 | */ | |
485 | struct ptlrpc_request_pool * | |
486 | ptlrpc_init_rq_pool(int num_rq, int msgsize, | |
487 | void (*populate_pool)(struct ptlrpc_request_pool *, int)) | |
488 | { | |
489 | struct ptlrpc_request_pool *pool; | |
490 | ||
3949015e | 491 | OBD_ALLOC(pool, sizeof(struct ptlrpc_request_pool)); |
d7e09d03 PT |
492 | if (!pool) |
493 | return NULL; | |
494 | ||
495 | /* Request next power of two for the allocation, because internally | |
496 | kernel would do exactly this */ | |
497 | ||
498 | spin_lock_init(&pool->prp_lock); | |
499 | INIT_LIST_HEAD(&pool->prp_req_list); | |
500 | pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD; | |
501 | pool->prp_populate = populate_pool; | |
502 | ||
503 | populate_pool(pool, num_rq); | |
504 | ||
505 | if (list_empty(&pool->prp_req_list)) { | |
506 | /* have not allocated a single request for the pool */ | |
3949015e | 507 | OBD_FREE(pool, sizeof(struct ptlrpc_request_pool)); |
d7e09d03 PT |
508 | pool = NULL; |
509 | } | |
510 | return pool; | |
511 | } | |
512 | EXPORT_SYMBOL(ptlrpc_init_rq_pool); | |
513 | ||
514 | /** | |
515 | * Fetches one request from pool \a pool | |
516 | */ | |
517 | static struct ptlrpc_request * | |
518 | ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool) | |
519 | { | |
520 | struct ptlrpc_request *request; | |
521 | struct lustre_msg *reqbuf; | |
522 | ||
523 | if (!pool) | |
524 | return NULL; | |
525 | ||
526 | spin_lock(&pool->prp_lock); | |
527 | ||
528 | /* See if we have anything in a pool, and bail out if nothing, | |
529 | * in writeout path, where this matters, this is safe to do, because | |
530 | * nothing is lost in this case, and when some in-flight requests | |
531 | * complete, this code will be called again. */ | |
532 | if (unlikely(list_empty(&pool->prp_req_list))) { | |
533 | spin_unlock(&pool->prp_lock); | |
534 | return NULL; | |
535 | } | |
536 | ||
537 | request = list_entry(pool->prp_req_list.next, struct ptlrpc_request, | |
538 | rq_list); | |
539 | list_del_init(&request->rq_list); | |
540 | spin_unlock(&pool->prp_lock); | |
541 | ||
542 | LASSERT(request->rq_reqbuf); | |
543 | LASSERT(request->rq_pool); | |
544 | ||
545 | reqbuf = request->rq_reqbuf; | |
546 | memset(request, 0, sizeof(*request)); | |
547 | request->rq_reqbuf = reqbuf; | |
548 | request->rq_reqbuf_len = pool->prp_rq_size; | |
549 | request->rq_pool = pool; | |
550 | ||
551 | return request; | |
552 | } | |
553 | ||
554 | /** | |
555 | * Returns freed \a request to pool. | |
556 | */ | |
557 | static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request) | |
558 | { | |
559 | struct ptlrpc_request_pool *pool = request->rq_pool; | |
560 | ||
561 | spin_lock(&pool->prp_lock); | |
562 | LASSERT(list_empty(&request->rq_list)); | |
563 | LASSERT(!request->rq_receiving_reply); | |
564 | list_add_tail(&request->rq_list, &pool->prp_req_list); | |
565 | spin_unlock(&pool->prp_lock); | |
566 | } | |
567 | ||
568 | static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request, | |
569 | __u32 version, int opcode, | |
570 | int count, __u32 *lengths, char **bufs, | |
571 | struct ptlrpc_cli_ctx *ctx) | |
572 | { | |
573 | struct obd_import *imp = request->rq_import; | |
574 | int rc; | |
d7e09d03 PT |
575 | |
576 | if (unlikely(ctx)) | |
577 | request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx); | |
578 | else { | |
579 | rc = sptlrpc_req_get_ctx(request); | |
580 | if (rc) | |
a9b3e8f3 | 581 | goto out_free; |
d7e09d03 PT |
582 | } |
583 | ||
584 | sptlrpc_req_set_flavor(request, opcode); | |
585 | ||
586 | rc = lustre_pack_request(request, imp->imp_msg_magic, count, | |
587 | lengths, bufs); | |
588 | if (rc) { | |
589 | LASSERT(!request->rq_pool); | |
a9b3e8f3 | 590 | goto out_ctx; |
d7e09d03 PT |
591 | } |
592 | ||
593 | lustre_msg_add_version(request->rq_reqmsg, version); | |
594 | request->rq_send_state = LUSTRE_IMP_FULL; | |
595 | request->rq_type = PTL_RPC_MSG_REQUEST; | |
596 | request->rq_export = NULL; | |
597 | ||
598 | request->rq_req_cbid.cbid_fn = request_out_callback; | |
599 | request->rq_req_cbid.cbid_arg = request; | |
600 | ||
601 | request->rq_reply_cbid.cbid_fn = reply_in_callback; | |
602 | request->rq_reply_cbid.cbid_arg = request; | |
603 | ||
604 | request->rq_reply_deadline = 0; | |
605 | request->rq_phase = RQ_PHASE_NEW; | |
606 | request->rq_next_phase = RQ_PHASE_UNDEFINED; | |
607 | ||
608 | request->rq_request_portal = imp->imp_client->cli_request_portal; | |
609 | request->rq_reply_portal = imp->imp_client->cli_reply_portal; | |
610 | ||
611 | ptlrpc_at_set_req_timeout(request); | |
612 | ||
613 | spin_lock_init(&request->rq_lock); | |
614 | INIT_LIST_HEAD(&request->rq_list); | |
615 | INIT_LIST_HEAD(&request->rq_timed_list); | |
616 | INIT_LIST_HEAD(&request->rq_replay_list); | |
617 | INIT_LIST_HEAD(&request->rq_ctx_chain); | |
618 | INIT_LIST_HEAD(&request->rq_set_chain); | |
619 | INIT_LIST_HEAD(&request->rq_history_list); | |
620 | INIT_LIST_HEAD(&request->rq_exp_list); | |
621 | init_waitqueue_head(&request->rq_reply_waitq); | |
622 | init_waitqueue_head(&request->rq_set_waitq); | |
623 | request->rq_xid = ptlrpc_next_xid(); | |
624 | atomic_set(&request->rq_refcount, 1); | |
625 | ||
626 | lustre_msg_set_opc(request->rq_reqmsg, opcode); | |
627 | ||
0a3bdb00 | 628 | return 0; |
d7e09d03 PT |
629 | out_ctx: |
630 | sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); | |
631 | out_free: | |
632 | class_import_put(imp); | |
633 | return rc; | |
634 | } | |
635 | ||
636 | int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, | |
637 | __u32 version, int opcode, char **bufs, | |
638 | struct ptlrpc_cli_ctx *ctx) | |
639 | { | |
640 | int count; | |
641 | ||
642 | count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT); | |
643 | return __ptlrpc_request_bufs_pack(request, version, opcode, count, | |
644 | request->rq_pill.rc_area[RCL_CLIENT], | |
645 | bufs, ctx); | |
646 | } | |
647 | EXPORT_SYMBOL(ptlrpc_request_bufs_pack); | |
648 | ||
649 | /** | |
650 | * Pack request buffers for network transfer, performing necessary encryption | |
651 | * steps if necessary. | |
652 | */ | |
653 | int ptlrpc_request_pack(struct ptlrpc_request *request, | |
654 | __u32 version, int opcode) | |
655 | { | |
656 | int rc; | |
657 | rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL); | |
658 | if (rc) | |
659 | return rc; | |
660 | ||
661 | /* For some old 1.8 clients (< 1.8.7), they will LASSERT the size of | |
662 | * ptlrpc_body sent from server equal to local ptlrpc_body size, so we | |
930cef9a | 663 | * have to send old ptlrpc_body to keep interoperability with these |
d7e09d03 PT |
664 | * clients. |
665 | * | |
666 | * Only three kinds of server->client RPCs so far: | |
667 | * - LDLM_BL_CALLBACK | |
668 | * - LDLM_CP_CALLBACK | |
669 | * - LDLM_GL_CALLBACK | |
670 | * | |
930cef9a | 671 | * XXX This should be removed whenever we drop the interoperability with |
d7e09d03 PT |
672 | * the these old clients. |
673 | */ | |
674 | if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK || | |
675 | opcode == LDLM_GL_CALLBACK) | |
676 | req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY, | |
677 | sizeof(struct ptlrpc_body_v2), RCL_CLIENT); | |
678 | ||
679 | return rc; | |
680 | } | |
681 | EXPORT_SYMBOL(ptlrpc_request_pack); | |
682 | ||
683 | /** | |
684 | * Helper function to allocate new request on import \a imp | |
685 | * and possibly using existing request from pool \a pool if provided. | |
686 | * Returns allocated request structure with import field filled or | |
687 | * NULL on error. | |
688 | */ | |
689 | static inline | |
690 | struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, | |
691 | struct ptlrpc_request_pool *pool) | |
692 | { | |
693 | struct ptlrpc_request *request = NULL; | |
694 | ||
695 | if (pool) | |
696 | request = ptlrpc_prep_req_from_pool(pool); | |
697 | ||
698 | if (!request) | |
0be19afa | 699 | request = ptlrpc_request_cache_alloc(GFP_NOFS); |
d7e09d03 PT |
700 | |
701 | if (request) { | |
702 | LASSERTF((unsigned long)imp > 0x1000, "%p", imp); | |
703 | LASSERT(imp != LP_POISON); | |
704 | LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p", | |
705 | imp->imp_client); | |
706 | LASSERT(imp->imp_client != LP_POISON); | |
707 | ||
708 | request->rq_import = class_import_get(imp); | |
709 | } else { | |
710 | CERROR("request allocation out of memory\n"); | |
711 | } | |
712 | ||
713 | return request; | |
714 | } | |
715 | ||
716 | /** | |
717 | * Helper function for creating a request. | |
930cef9a | 718 | * Calls __ptlrpc_request_alloc to allocate new request structure and inits |
d7e09d03 PT |
719 | * buffer structures according to capsule template \a format. |
720 | * Returns allocated request structure pointer or NULL on error. | |
721 | */ | |
722 | static struct ptlrpc_request * | |
723 | ptlrpc_request_alloc_internal(struct obd_import *imp, | |
0028d585 | 724 | struct ptlrpc_request_pool *pool, |
d7e09d03 PT |
725 | const struct req_format *format) |
726 | { | |
727 | struct ptlrpc_request *request; | |
728 | ||
729 | request = __ptlrpc_request_alloc(imp, pool); | |
730 | if (request == NULL) | |
731 | return NULL; | |
732 | ||
733 | req_capsule_init(&request->rq_pill, request, RCL_CLIENT); | |
734 | req_capsule_set(&request->rq_pill, format); | |
735 | return request; | |
736 | } | |
737 | ||
738 | /** | |
739 | * Allocate new request structure for import \a imp and initialize its | |
740 | * buffer structure according to capsule template \a format. | |
741 | */ | |
742 | struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, | |
743 | const struct req_format *format) | |
744 | { | |
745 | return ptlrpc_request_alloc_internal(imp, NULL, format); | |
746 | } | |
747 | EXPORT_SYMBOL(ptlrpc_request_alloc); | |
748 | ||
/**
 * Allocate a new request structure for import \a imp, taking it from pool
 * \a pool when possible, and set up its buffer structure according to
 * capsule template \a format.
 * Returns the request or NULL on error.
 */
struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
					    struct ptlrpc_request_pool *pool,
					    const struct req_format *format)
{
	return ptlrpc_request_alloc_internal(imp, pool, format);
}
EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
760 | ||
761 | /** | |
762 | * For requests not from pool, free memory of the request structure. | |
763 | * For requests obtained from a pool earlier, return request back to pool. | |
764 | */ | |
765 | void ptlrpc_request_free(struct ptlrpc_request *request) | |
766 | { | |
767 | if (request->rq_pool) | |
768 | __ptlrpc_free_req_to_pool(request); | |
769 | else | |
35b2e1b7 | 770 | ptlrpc_request_cache_free(request); |
d7e09d03 PT |
771 | } |
772 | EXPORT_SYMBOL(ptlrpc_request_free); | |
773 | ||
774 | /** | |
930cef9a | 775 | * Allocate new request for operation \a opcode and immediately pack it for |
d7e09d03 PT |
776 | * network transfer. |
777 | * Only used for simple requests like OBD_PING where the only important | |
778 | * part of the request is operation itself. | |
779 | * Returns allocated request or NULL on error. | |
780 | */ | |
781 | struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, | |
782 | const struct req_format *format, | |
783 | __u32 version, int opcode) | |
784 | { | |
785 | struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format); | |
786 | int rc; | |
787 | ||
788 | if (req) { | |
789 | rc = ptlrpc_request_pack(req, version, opcode); | |
790 | if (rc) { | |
791 | ptlrpc_request_free(req); | |
792 | req = NULL; | |
793 | } | |
794 | } | |
795 | return req; | |
796 | } | |
797 | EXPORT_SYMBOL(ptlrpc_request_alloc_pack); | |
798 | ||
799 | /** | |
930cef9a | 800 | * Prepare request (fetched from pool \a pool if not NULL) on import \a imp |
d7e09d03 PT |
801 | * for operation \a opcode. Request would contain \a count buffers. |
802 | * Sizes of buffers are described in array \a lengths and buffers themselves | |
803 | * are provided by a pointer \a bufs. | |
804 | * Returns prepared request structure pointer or NULL on error. | |
805 | */ | |
806 | struct ptlrpc_request * | |
807 | ptlrpc_prep_req_pool(struct obd_import *imp, | |
808 | __u32 version, int opcode, | |
809 | int count, __u32 *lengths, char **bufs, | |
810 | struct ptlrpc_request_pool *pool) | |
811 | { | |
812 | struct ptlrpc_request *request; | |
813 | int rc; | |
814 | ||
815 | request = __ptlrpc_request_alloc(imp, pool); | |
816 | if (!request) | |
817 | return NULL; | |
818 | ||
819 | rc = __ptlrpc_request_bufs_pack(request, version, opcode, count, | |
820 | lengths, bufs, NULL); | |
821 | if (rc) { | |
822 | ptlrpc_request_free(request); | |
823 | request = NULL; | |
824 | } | |
825 | return request; | |
826 | } | |
827 | EXPORT_SYMBOL(ptlrpc_prep_req_pool); | |
828 | ||
829 | /** | |
830 | * Same as ptlrpc_prep_req_pool, but without pool | |
831 | */ | |
832 | struct ptlrpc_request * | |
833 | ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count, | |
834 | __u32 *lengths, char **bufs) | |
835 | { | |
836 | return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs, | |
837 | NULL); | |
838 | } | |
839 | EXPORT_SYMBOL(ptlrpc_prep_req); | |
840 | ||
841 | /** | |
842 | * Allocate and initialize new request set structure. | |
843 | * Returns a pointer to the newly allocated set structure or NULL on error. | |
844 | */ | |
845 | struct ptlrpc_request_set *ptlrpc_prep_set(void) | |
846 | { | |
847 | struct ptlrpc_request_set *set; | |
848 | ||
ec83e611 | 849 | OBD_ALLOC(set, sizeof(*set)); |
d7e09d03 | 850 | if (!set) |
0a3bdb00 | 851 | return NULL; |
d7e09d03 PT |
852 | atomic_set(&set->set_refcount, 1); |
853 | INIT_LIST_HEAD(&set->set_requests); | |
854 | init_waitqueue_head(&set->set_waitq); | |
855 | atomic_set(&set->set_new_count, 0); | |
856 | atomic_set(&set->set_remaining, 0); | |
857 | spin_lock_init(&set->set_new_req_lock); | |
858 | INIT_LIST_HEAD(&set->set_new_requests); | |
859 | INIT_LIST_HEAD(&set->set_cblist); | |
860 | set->set_max_inflight = UINT_MAX; | |
861 | set->set_producer = NULL; | |
862 | set->set_producer_arg = NULL; | |
863 | set->set_rc = 0; | |
864 | ||
0a3bdb00 | 865 | return set; |
d7e09d03 PT |
866 | } |
867 | EXPORT_SYMBOL(ptlrpc_prep_set); | |
868 | ||
869 | /** | |
870 | * Allocate and initialize new request set structure with flow control | |
871 | * extension. This extension allows to control the number of requests in-flight | |
872 | * for the whole set. A callback function to generate requests must be provided | |
873 | * and the request set will keep the number of requests sent over the wire to | |
874 | * @max_inflight. | |
875 | * Returns a pointer to the newly allocated set structure or NULL on error. | |
876 | */ | |
877 | struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, | |
878 | void *arg) | |
879 | ||
880 | { | |
881 | struct ptlrpc_request_set *set; | |
882 | ||
883 | set = ptlrpc_prep_set(); | |
884 | if (!set) | |
0a3bdb00 | 885 | return NULL; |
d7e09d03 PT |
886 | |
887 | set->set_max_inflight = max; | |
888 | set->set_producer = func; | |
889 | set->set_producer_arg = arg; | |
890 | ||
0a3bdb00 | 891 | return set; |
d7e09d03 PT |
892 | } |
893 | EXPORT_SYMBOL(ptlrpc_prep_fcset); | |
894 | ||
895 | /** | |
896 | * Wind down and free request set structure previously allocated with | |
897 | * ptlrpc_prep_set. | |
898 | * Ensures that all requests on the set have completed and removes | |
899 | * all requests from the request list in a set. | |
900 | * If any unsent request happen to be on the list, pretends that they got | |
901 | * an error in flight and calls their completion handler. | |
902 | */ | |
903 | void ptlrpc_set_destroy(struct ptlrpc_request_set *set) | |
904 | { | |
905 | struct list_head *tmp; | |
906 | struct list_head *next; | |
907 | int expected_phase; | |
908 | int n = 0; | |
d7e09d03 PT |
909 | |
910 | /* Requests on the set should either all be completed, or all be new */ | |
911 | expected_phase = (atomic_read(&set->set_remaining) == 0) ? | |
912 | RQ_PHASE_COMPLETE : RQ_PHASE_NEW; | |
3949015e | 913 | list_for_each(tmp, &set->set_requests) { |
d7e09d03 PT |
914 | struct ptlrpc_request *req = |
915 | list_entry(tmp, struct ptlrpc_request, | |
916 | rq_set_chain); | |
917 | ||
918 | LASSERT(req->rq_phase == expected_phase); | |
919 | n++; | |
920 | } | |
921 | ||
922 | LASSERTF(atomic_read(&set->set_remaining) == 0 || | |
923 | atomic_read(&set->set_remaining) == n, "%d / %d\n", | |
924 | atomic_read(&set->set_remaining), n); | |
925 | ||
926 | list_for_each_safe(tmp, next, &set->set_requests) { | |
927 | struct ptlrpc_request *req = | |
928 | list_entry(tmp, struct ptlrpc_request, | |
929 | rq_set_chain); | |
930 | list_del_init(&req->rq_set_chain); | |
931 | ||
932 | LASSERT(req->rq_phase == expected_phase); | |
933 | ||
934 | if (req->rq_phase == RQ_PHASE_NEW) { | |
935 | ptlrpc_req_interpret(NULL, req, -EBADR); | |
936 | atomic_dec(&set->set_remaining); | |
937 | } | |
938 | ||
939 | spin_lock(&req->rq_lock); | |
940 | req->rq_set = NULL; | |
941 | req->rq_invalid_rqset = 0; | |
942 | spin_unlock(&req->rq_lock); | |
943 | ||
3949015e | 944 | ptlrpc_req_finished(req); |
d7e09d03 PT |
945 | } |
946 | ||
947 | LASSERT(atomic_read(&set->set_remaining) == 0); | |
948 | ||
949 | ptlrpc_reqset_put(set); | |
d7e09d03 PT |
950 | } |
951 | EXPORT_SYMBOL(ptlrpc_set_destroy); | |
952 | ||
953 | /** | |
954 | * Add a callback function \a fn to the set. | |
955 | * This function would be called when all requests on this set are completed. | |
956 | * The function will be passed \a data argument. | |
957 | */ | |
958 | int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, | |
959 | set_interpreter_func fn, void *data) | |
960 | { | |
961 | struct ptlrpc_set_cbdata *cbdata; | |
962 | ||
963 | OBD_ALLOC_PTR(cbdata); | |
964 | if (cbdata == NULL) | |
0a3bdb00 | 965 | return -ENOMEM; |
d7e09d03 PT |
966 | |
967 | cbdata->psc_interpret = fn; | |
968 | cbdata->psc_data = data; | |
969 | list_add_tail(&cbdata->psc_item, &set->set_cblist); | |
970 | ||
0a3bdb00 | 971 | return 0; |
d7e09d03 PT |
972 | } |
973 | EXPORT_SYMBOL(ptlrpc_set_add_cb); | |
974 | ||
975 | /** | |
976 | * Add a new request to the general purpose request set. | |
977 | * Assumes request reference from the caller. | |
978 | */ | |
979 | void ptlrpc_set_add_req(struct ptlrpc_request_set *set, | |
980 | struct ptlrpc_request *req) | |
981 | { | |
982 | LASSERT(list_empty(&req->rq_set_chain)); | |
983 | ||
984 | /* The set takes over the caller's request reference */ | |
985 | list_add_tail(&req->rq_set_chain, &set->set_requests); | |
986 | req->rq_set = set; | |
987 | atomic_inc(&set->set_remaining); | |
988 | req->rq_queued_time = cfs_time_current(); | |
989 | ||
990 | if (req->rq_reqmsg != NULL) | |
991 | lustre_msg_set_jobid(req->rq_reqmsg, NULL); | |
992 | ||
993 | if (set->set_producer != NULL) | |
994 | /* If the request set has a producer callback, the RPC must be | |
995 | * sent straight away */ | |
996 | ptlrpc_send_new_req(req); | |
997 | } | |
998 | EXPORT_SYMBOL(ptlrpc_set_add_req); | |
999 | ||
1000 | /** | |
1001 | * Add a request to a request with dedicated server thread | |
1002 | * and wake the thread to make any necessary processing. | |
1003 | * Currently only used for ptlrpcd. | |
1004 | */ | |
1005 | void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, | |
1006 | struct ptlrpc_request *req) | |
1007 | { | |
1008 | struct ptlrpc_request_set *set = pc->pc_set; | |
1009 | int count, i; | |
1010 | ||
1011 | LASSERT(req->rq_set == NULL); | |
1012 | LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0); | |
1013 | ||
1014 | spin_lock(&set->set_new_req_lock); | |
1015 | /* | |
1016 | * The set takes over the caller's request reference. | |
1017 | */ | |
1018 | req->rq_set = set; | |
1019 | req->rq_queued_time = cfs_time_current(); | |
1020 | list_add_tail(&req->rq_set_chain, &set->set_new_requests); | |
1021 | count = atomic_inc_return(&set->set_new_count); | |
1022 | spin_unlock(&set->set_new_req_lock); | |
1023 | ||
1024 | /* Only need to call wakeup once for the first entry. */ | |
1025 | if (count == 1) { | |
1026 | wake_up(&set->set_waitq); | |
1027 | ||
1028 | /* XXX: It maybe unnecessary to wakeup all the partners. But to | |
1029 | * guarantee the async RPC can be processed ASAP, we have | |
1030 | * no other better choice. It maybe fixed in future. */ | |
1031 | for (i = 0; i < pc->pc_npartners; i++) | |
1032 | wake_up(&pc->pc_partners[i]->pc_set->set_waitq); | |
1033 | } | |
1034 | } | |
1035 | EXPORT_SYMBOL(ptlrpc_set_add_new_req); | |
1036 | ||
1037 | /** | |
1038 | * Based on the current state of the import, determine if the request | |
1039 | * can be sent, is an error, or should be delayed. | |
1040 | * | |
1041 | * Returns true if this request should be delayed. If false, and | |
1042 | * *status is set, then the request can not be sent and *status is the | |
1043 | * error code. If false and status is 0, then request can be sent. | |
1044 | * | |
1045 | * The imp->imp_lock must be held. | |
1046 | */ | |
1047 | static int ptlrpc_import_delay_req(struct obd_import *imp, | |
1048 | struct ptlrpc_request *req, int *status) | |
1049 | { | |
1050 | int delay = 0; | |
d7e09d03 | 1051 | |
3949015e | 1052 | LASSERT(status != NULL); |
d7e09d03 PT |
1053 | *status = 0; |
1054 | ||
1055 | if (req->rq_ctx_init || req->rq_ctx_fini) { | |
1056 | /* always allow ctx init/fini rpc go through */ | |
1057 | } else if (imp->imp_state == LUSTRE_IMP_NEW) { | |
1058 | DEBUG_REQ(D_ERROR, req, "Uninitialized import."); | |
1059 | *status = -EIO; | |
1060 | } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { | |
1061 | /* pings may safely race with umount */ | |
1062 | DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ? | |
1063 | D_HA : D_ERROR, req, "IMP_CLOSED "); | |
1064 | *status = -EIO; | |
1065 | } else if (ptlrpc_send_limit_expired(req)) { | |
1066 | /* probably doesn't need to be a D_ERROR after initial testing */ | |
1067 | DEBUG_REQ(D_ERROR, req, "send limit expired "); | |
1068 | *status = -EIO; | |
1069 | } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && | |
1070 | imp->imp_state == LUSTRE_IMP_CONNECTING) { | |
7b8633de | 1071 | /* allow CONNECT even if import is invalid */ |
d7e09d03 PT |
1072 | if (atomic_read(&imp->imp_inval_count) != 0) { |
1073 | DEBUG_REQ(D_ERROR, req, "invalidate in flight"); | |
1074 | *status = -EIO; | |
1075 | } | |
1076 | } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) { | |
1077 | if (!imp->imp_deactive) | |
1078 | DEBUG_REQ(D_NET, req, "IMP_INVALID"); | |
1079 | *status = -ESHUTDOWN; /* bz 12940 */ | |
1080 | } else if (req->rq_import_generation != imp->imp_generation) { | |
1081 | DEBUG_REQ(D_ERROR, req, "req wrong generation:"); | |
1082 | *status = -EIO; | |
1083 | } else if (req->rq_send_state != imp->imp_state) { | |
1084 | /* invalidate in progress - any requests should be drop */ | |
1085 | if (atomic_read(&imp->imp_inval_count) != 0) { | |
1086 | DEBUG_REQ(D_ERROR, req, "invalidate in flight"); | |
1087 | *status = -EIO; | |
1088 | } else if (imp->imp_dlm_fake || req->rq_no_delay) { | |
1089 | *status = -EWOULDBLOCK; | |
1090 | } else if (req->rq_allow_replay && | |
1091 | (imp->imp_state == LUSTRE_IMP_REPLAY || | |
1092 | imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS || | |
1093 | imp->imp_state == LUSTRE_IMP_REPLAY_WAIT || | |
1094 | imp->imp_state == LUSTRE_IMP_RECOVER)) { | |
1095 | DEBUG_REQ(D_HA, req, "allow during recovery.\n"); | |
1096 | } else { | |
1097 | delay = 1; | |
1098 | } | |
1099 | } | |
1100 | ||
0a3bdb00 | 1101 | return delay; |
d7e09d03 PT |
1102 | } |
1103 | ||
1104 | /** | |
930cef9a | 1105 | * Decide if the error message regarding provided request \a req |
d7e09d03 PT |
1106 | * should be printed to the console or not. |
1107 | * Makes it's decision on request status and other properties. | |
1108 | * Returns 1 to print error on the system console or 0 if not. | |
1109 | */ | |
1110 | static int ptlrpc_console_allow(struct ptlrpc_request *req) | |
1111 | { | |
1112 | __u32 opc; | |
1113 | int err; | |
1114 | ||
1115 | LASSERT(req->rq_reqmsg != NULL); | |
1116 | opc = lustre_msg_get_opc(req->rq_reqmsg); | |
1117 | ||
1118 | /* Suppress particular reconnect errors which are to be expected. No | |
1119 | * errors are suppressed for the initial connection on an import */ | |
1120 | if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) && | |
1121 | (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) { | |
1122 | ||
1123 | /* Suppress timed out reconnect requests */ | |
1124 | if (req->rq_timedout) | |
1125 | return 0; | |
1126 | ||
1127 | /* Suppress unavailable/again reconnect requests */ | |
1128 | err = lustre_msg_get_status(req->rq_repmsg); | |
1129 | if (err == -ENODEV || err == -EAGAIN) | |
1130 | return 0; | |
1131 | } | |
1132 | ||
1133 | return 1; | |
1134 | } | |
1135 | ||
1136 | /** | |
1137 | * Check request processing status. | |
1138 | * Returns the status. | |
1139 | */ | |
1140 | static int ptlrpc_check_status(struct ptlrpc_request *req) | |
1141 | { | |
1142 | int err; | |
d7e09d03 PT |
1143 | |
1144 | err = lustre_msg_get_status(req->rq_repmsg); | |
1145 | if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { | |
1146 | struct obd_import *imp = req->rq_import; | |
1147 | __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); | |
1148 | if (ptlrpc_console_allow(req)) | |
1149 | LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s," | |
1150 | " operation %s failed with %d.\n", | |
1151 | imp->imp_obd->obd_name, | |
1152 | libcfs_nid2str( | |
1153 | imp->imp_connection->c_peer.nid), | |
1154 | ll_opcode2str(opc), err); | |
0a3bdb00 | 1155 | return err < 0 ? err : -EINVAL; |
d7e09d03 PT |
1156 | } |
1157 | ||
1158 | if (err < 0) { | |
1159 | DEBUG_REQ(D_INFO, req, "status is %d", err); | |
1160 | } else if (err > 0) { | |
1161 | /* XXX: translate this error from net to host */ | |
1162 | DEBUG_REQ(D_INFO, req, "status is %d", err); | |
1163 | } | |
1164 | ||
0a3bdb00 | 1165 | return err; |
d7e09d03 PT |
1166 | } |
1167 | ||
1168 | /** | |
1169 | * save pre-versions of objects into request for replay. | |
1170 | * Versions are obtained from server reply. | |
1171 | * used for VBR. | |
1172 | */ | |
1173 | static void ptlrpc_save_versions(struct ptlrpc_request *req) | |
1174 | { | |
1175 | struct lustre_msg *repmsg = req->rq_repmsg; | |
1176 | struct lustre_msg *reqmsg = req->rq_reqmsg; | |
1177 | __u64 *versions = lustre_msg_get_versions(repmsg); | |
d7e09d03 PT |
1178 | |
1179 | if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) | |
1180 | return; | |
1181 | ||
1182 | LASSERT(versions); | |
1183 | lustre_msg_set_versions(reqmsg, versions); | |
55f5a824 | 1184 | CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n", |
d7e09d03 | 1185 | versions[0], versions[1]); |
d7e09d03 PT |
1186 | } |
1187 | ||
1188 | /** | |
1189 | * Callback function called when client receives RPC reply for \a req. | |
1190 | * Returns 0 on success or error code. | |
930cef9a | 1191 | * The return value would be assigned to req->rq_status by the caller |
d7e09d03 PT |
1192 | * as request processing status. |
1193 | * This function also decides if the request needs to be saved for later replay. | |
1194 | */ | |
1195 | static int after_reply(struct ptlrpc_request *req) | |
1196 | { | |
1197 | struct obd_import *imp = req->rq_import; | |
1198 | struct obd_device *obd = req->rq_import->imp_obd; | |
1199 | int rc; | |
1200 | struct timeval work_start; | |
1201 | long timediff; | |
d7e09d03 PT |
1202 | |
1203 | LASSERT(obd != NULL); | |
1204 | /* repbuf must be unlinked */ | |
cf378ff7 | 1205 | LASSERT(!req->rq_receiving_reply && !req->rq_reply_unlink); |
d7e09d03 PT |
1206 | |
1207 | if (req->rq_reply_truncate) { | |
1208 | if (ptlrpc_no_resend(req)) { | |
1209 | DEBUG_REQ(D_ERROR, req, "reply buffer overflow," | |
1210 | " expected: %d, actual size: %d", | |
1211 | req->rq_nob_received, req->rq_repbuf_len); | |
0a3bdb00 | 1212 | return -EOVERFLOW; |
d7e09d03 PT |
1213 | } |
1214 | ||
1215 | sptlrpc_cli_free_repbuf(req); | |
1216 | /* Pass the required reply buffer size (include | |
1217 | * space for early reply). | |
1218 | * NB: no need to roundup because alloc_repbuf | |
1219 | * will roundup it */ | |
1220 | req->rq_replen = req->rq_nob_received; | |
1221 | req->rq_nob_received = 0; | |
15c50ccc | 1222 | spin_lock(&req->rq_lock); |
d7e09d03 | 1223 | req->rq_resend = 1; |
15c50ccc | 1224 | spin_unlock(&req->rq_lock); |
0a3bdb00 | 1225 | return 0; |
d7e09d03 PT |
1226 | } |
1227 | ||
1228 | /* | |
1229 | * NB Until this point, the whole of the incoming message, | |
1230 | * including buflens, status etc is in the sender's byte order. | |
1231 | */ | |
1232 | rc = sptlrpc_cli_unwrap_reply(req); | |
1233 | if (rc) { | |
1234 | DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc); | |
0a3bdb00 | 1235 | return rc; |
d7e09d03 PT |
1236 | } |
1237 | ||
1238 | /* | |
1239 | * Security layer unwrap might ask resend this request. | |
1240 | */ | |
1241 | if (req->rq_resend) | |
0a3bdb00 | 1242 | return 0; |
d7e09d03 PT |
1243 | |
1244 | rc = unpack_reply(req); | |
1245 | if (rc) | |
0a3bdb00 | 1246 | return rc; |
d7e09d03 PT |
1247 | |
1248 | /* retry indefinitely on EINPROGRESS */ | |
1249 | if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS && | |
1250 | ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) { | |
7264b8a5 | 1251 | time_t now = get_seconds(); |
d7e09d03 PT |
1252 | |
1253 | DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS"); | |
1254 | req->rq_resend = 1; | |
1255 | req->rq_nr_resend++; | |
1256 | ||
1257 | /* allocate new xid to avoid reply reconstruction */ | |
1258 | if (!req->rq_bulk) { | |
1259 | /* new xid is already allocated for bulk in | |
1260 | * ptlrpc_check_set() */ | |
1261 | req->rq_xid = ptlrpc_next_xid(); | |
1262 | DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for " | |
1263 | "resend on EINPROGRESS"); | |
1264 | } | |
1265 | ||
1266 | /* Readjust the timeout for current conditions */ | |
1267 | ptlrpc_at_set_req_timeout(req); | |
1268 | /* delay resend to give a chance to the server to get ready. | |
1269 | * The delay is increased by 1s on every resend and is capped to | |
1270 | * the current request timeout (i.e. obd_timeout if AT is off, | |
1271 | * or AT service time x 125% + 5s, see at_est2timeout) */ | |
1272 | if (req->rq_nr_resend > req->rq_timeout) | |
1273 | req->rq_sent = now + req->rq_timeout; | |
1274 | else | |
1275 | req->rq_sent = now + req->rq_nr_resend; | |
1276 | ||
0a3bdb00 | 1277 | return 0; |
d7e09d03 PT |
1278 | } |
1279 | ||
1280 | do_gettimeofday(&work_start); | |
1281 | timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL); | |
1282 | if (obd->obd_svc_stats != NULL) { | |
1283 | lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, | |
1284 | timediff); | |
1285 | ptlrpc_lprocfs_rpc_sent(req, timediff); | |
1286 | } | |
1287 | ||
1288 | if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && | |
1289 | lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { | |
1290 | DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)", | |
1291 | lustre_msg_get_type(req->rq_repmsg)); | |
0a3bdb00 | 1292 | return -EPROTO; |
d7e09d03 PT |
1293 | } |
1294 | ||
1295 | if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) | |
1296 | CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); | |
1297 | ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); | |
1298 | ptlrpc_at_adj_net_latency(req, | |
1299 | lustre_msg_get_service_time(req->rq_repmsg)); | |
1300 | ||
1301 | rc = ptlrpc_check_status(req); | |
1302 | imp->imp_connect_error = rc; | |
1303 | ||
1304 | if (rc) { | |
1305 | /* | |
1306 | * Either we've been evicted, or the server has failed for | |
1307 | * some reason. Try to reconnect, and if that fails, punt to | |
1308 | * the upcall. | |
1309 | */ | |
1310 | if (ll_rpc_recoverable_error(rc)) { | |
1311 | if (req->rq_send_state != LUSTRE_IMP_FULL || | |
1312 | imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { | |
0a3bdb00 | 1313 | return rc; |
d7e09d03 PT |
1314 | } |
1315 | ptlrpc_request_handle_notconn(req); | |
0a3bdb00 | 1316 | return rc; |
d7e09d03 PT |
1317 | } |
1318 | } else { | |
1319 | /* | |
1320 | * Let's look if server sent slv. Do it only for RPC with | |
1321 | * rc == 0. | |
1322 | */ | |
1323 | ldlm_cli_update_pool(req); | |
1324 | } | |
1325 | ||
1326 | /* | |
1327 | * Store transno in reqmsg for replay. | |
1328 | */ | |
1329 | if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { | |
1330 | req->rq_transno = lustre_msg_get_transno(req->rq_repmsg); | |
1331 | lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno); | |
1332 | } | |
1333 | ||
1334 | if (imp->imp_replayable) { | |
1335 | spin_lock(&imp->imp_lock); | |
1336 | /* | |
1337 | * No point in adding already-committed requests to the replay | |
1338 | * list, we will just remove them immediately. b=9829 | |
1339 | */ | |
1340 | if (req->rq_transno != 0 && | |
1341 | (req->rq_transno > | |
1342 | lustre_msg_get_last_committed(req->rq_repmsg) || | |
1343 | req->rq_replay)) { | |
1344 | /** version recovery */ | |
1345 | ptlrpc_save_versions(req); | |
1346 | ptlrpc_retain_replayable_request(req, imp); | |
503a1ac7 LZ |
1347 | } else if (req->rq_commit_cb != NULL && |
1348 | list_empty(&req->rq_replay_list)) { | |
1349 | /* NB: don't call rq_commit_cb if it's already on | |
1350 | * rq_replay_list, ptlrpc_free_committed() will call | |
1351 | * it later, see LU-3618 for details */ | |
d7e09d03 PT |
1352 | spin_unlock(&imp->imp_lock); |
1353 | req->rq_commit_cb(req); | |
1354 | spin_lock(&imp->imp_lock); | |
1355 | } | |
1356 | ||
1357 | /* | |
1358 | * Replay-enabled imports return commit-status information. | |
1359 | */ | |
1360 | if (lustre_msg_get_last_committed(req->rq_repmsg)) { | |
1361 | imp->imp_peer_committed_transno = | |
1362 | lustre_msg_get_last_committed(req->rq_repmsg); | |
1363 | } | |
1364 | ||
1365 | ptlrpc_free_committed(imp); | |
1366 | ||
1367 | if (!list_empty(&imp->imp_replay_list)) { | |
1368 | struct ptlrpc_request *last; | |
1369 | ||
1370 | last = list_entry(imp->imp_replay_list.prev, | |
1371 | struct ptlrpc_request, | |
1372 | rq_replay_list); | |
1373 | /* | |
1374 | * Requests with rq_replay stay on the list even if no | |
1375 | * commit is expected. | |
1376 | */ | |
1377 | if (last->rq_transno > imp->imp_peer_committed_transno) | |
1378 | ptlrpc_pinger_commit_expected(imp); | |
1379 | } | |
1380 | ||
1381 | spin_unlock(&imp->imp_lock); | |
1382 | } | |
1383 | ||
0a3bdb00 | 1384 | return rc; |
d7e09d03 PT |
1385 | } |
1386 | ||
1387 | /** | |
1388 | * Helper function to send request \a req over the network for the first time | |
1389 | * Also adjusts request phase. | |
1390 | * Returns 0 on success or error code. | |
1391 | */ | |
1392 | static int ptlrpc_send_new_req(struct ptlrpc_request *req) | |
1393 | { | |
1394 | struct obd_import *imp = req->rq_import; | |
1395 | int rc; | |
d7e09d03 PT |
1396 | |
1397 | LASSERT(req->rq_phase == RQ_PHASE_NEW); | |
7264b8a5 | 1398 | if (req->rq_sent && (req->rq_sent > get_seconds()) && |
d7e09d03 PT |
1399 | (!req->rq_generation_set || |
1400 | req->rq_import_generation == imp->imp_generation)) | |
0a3bdb00 | 1401 | return 0; |
d7e09d03 PT |
1402 | |
1403 | ptlrpc_rqphase_move(req, RQ_PHASE_RPC); | |
1404 | ||
1405 | spin_lock(&imp->imp_lock); | |
1406 | ||
1407 | if (!req->rq_generation_set) | |
1408 | req->rq_import_generation = imp->imp_generation; | |
1409 | ||
1410 | if (ptlrpc_import_delay_req(imp, req, &rc)) { | |
1411 | spin_lock(&req->rq_lock); | |
1412 | req->rq_waiting = 1; | |
1413 | spin_unlock(&req->rq_lock); | |
1414 | ||
1415 | DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: " | |
1416 | "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg), | |
1417 | ptlrpc_import_state_name(req->rq_send_state), | |
1418 | ptlrpc_import_state_name(imp->imp_state)); | |
1419 | LASSERT(list_empty(&req->rq_list)); | |
1420 | list_add_tail(&req->rq_list, &imp->imp_delayed_list); | |
1421 | atomic_inc(&req->rq_import->imp_inflight); | |
1422 | spin_unlock(&imp->imp_lock); | |
0a3bdb00 | 1423 | return 0; |
d7e09d03 PT |
1424 | } |
1425 | ||
1426 | if (rc != 0) { | |
1427 | spin_unlock(&imp->imp_lock); | |
1428 | req->rq_status = rc; | |
1429 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
0a3bdb00 | 1430 | return rc; |
d7e09d03 PT |
1431 | } |
1432 | ||
1433 | LASSERT(list_empty(&req->rq_list)); | |
1434 | list_add_tail(&req->rq_list, &imp->imp_sending_list); | |
1435 | atomic_inc(&req->rq_import->imp_inflight); | |
1436 | spin_unlock(&imp->imp_lock); | |
1437 | ||
1438 | lustre_msg_set_status(req->rq_reqmsg, current_pid()); | |
1439 | ||
1440 | rc = sptlrpc_req_refresh_ctx(req, -1); | |
1441 | if (rc) { | |
1442 | if (req->rq_err) { | |
1443 | req->rq_status = rc; | |
0a3bdb00 | 1444 | return 1; |
d7e09d03 | 1445 | } else { |
15c50ccc | 1446 | spin_lock(&req->rq_lock); |
d7e09d03 | 1447 | req->rq_wait_ctx = 1; |
15c50ccc | 1448 | spin_unlock(&req->rq_lock); |
0a3bdb00 | 1449 | return 0; |
d7e09d03 PT |
1450 | } |
1451 | } | |
1452 | ||
1453 | CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc" | |
b0f5aad5 | 1454 | " %s:%s:%d:%llu:%s:%d\n", current_comm(), |
d7e09d03 PT |
1455 | imp->imp_obd->obd_uuid.uuid, |
1456 | lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, | |
1457 | libcfs_nid2str(imp->imp_connection->c_peer.nid), | |
1458 | lustre_msg_get_opc(req->rq_reqmsg)); | |
1459 | ||
1460 | rc = ptl_send_rpc(req, 0); | |
1461 | if (rc) { | |
1462 | DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc); | |
15c50ccc | 1463 | spin_lock(&req->rq_lock); |
d7e09d03 | 1464 | req->rq_net_err = 1; |
15c50ccc | 1465 | spin_unlock(&req->rq_lock); |
0a3bdb00 | 1466 | return rc; |
d7e09d03 | 1467 | } |
0a3bdb00 | 1468 | return 0; |
d7e09d03 PT |
1469 | } |
1470 | ||
1471 | static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set) | |
1472 | { | |
1473 | int remaining, rc; | |
d7e09d03 PT |
1474 | |
1475 | LASSERT(set->set_producer != NULL); | |
1476 | ||
1477 | remaining = atomic_read(&set->set_remaining); | |
1478 | ||
1479 | /* populate the ->set_requests list with requests until we | |
1480 | * reach the maximum number of RPCs in flight for this set */ | |
1481 | while (atomic_read(&set->set_remaining) < set->set_max_inflight) { | |
1482 | rc = set->set_producer(set, set->set_producer_arg); | |
1483 | if (rc == -ENOENT) { | |
1484 | /* no more RPC to produce */ | |
1485 | set->set_producer = NULL; | |
1486 | set->set_producer_arg = NULL; | |
0a3bdb00 | 1487 | return 0; |
d7e09d03 PT |
1488 | } |
1489 | } | |
1490 | ||
0a3bdb00 | 1491 | return (atomic_read(&set->set_remaining) - remaining); |
d7e09d03 PT |
1492 | } |
1493 | ||
1494 | /** | |
1495 | * this sends any unsent RPCs in \a set and returns 1 if all are sent | |
1496 | * and no more replies are expected. | |
1497 | * (it is possible to get less replies than requests sent e.g. due to timed out | |
1498 | * requests or requests that we had trouble to send out) | |
da9e33c9 CM |
1499 | * |
1500 | * NOTE: This function contains a potential schedule point (cond_resched()). | |
d7e09d03 PT |
1501 | */ |
1502 | int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) | |
1503 | { | |
1504 | struct list_head *tmp, *next; | |
1505 | int force_timer_recalc = 0; | |
d7e09d03 PT |
1506 | |
1507 | if (atomic_read(&set->set_remaining) == 0) | |
0a3bdb00 | 1508 | return 1; |
d7e09d03 PT |
1509 | |
1510 | list_for_each_safe(tmp, next, &set->set_requests) { | |
1511 | struct ptlrpc_request *req = | |
1512 | list_entry(tmp, struct ptlrpc_request, | |
1513 | rq_set_chain); | |
1514 | struct obd_import *imp = req->rq_import; | |
1515 | int unregistered = 0; | |
1516 | int rc = 0; | |
1517 | ||
da9e33c9 CM |
1518 | /* This schedule point is mainly for the ptlrpcd caller of this |
1519 | * function. Most ptlrpc sets are not long-lived and unbounded | |
1520 | * in length, but at the least the set used by the ptlrpcd is. | |
1521 | * Since the processing time is unbounded, we need to insert an | |
1522 | * explicit schedule point to make the thread well-behaved. | |
1523 | */ | |
1524 | cond_resched(); | |
1525 | ||
d7e09d03 PT |
1526 | if (req->rq_phase == RQ_PHASE_NEW && |
1527 | ptlrpc_send_new_req(req)) { | |
1528 | force_timer_recalc = 1; | |
1529 | } | |
1530 | ||
1531 | /* delayed send - skip */ | |
1532 | if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) | |
1533 | continue; | |
1534 | ||
1535 | /* delayed resend - skip */ | |
1536 | if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && | |
7264b8a5 | 1537 | req->rq_sent > get_seconds()) |
d7e09d03 PT |
1538 | continue; |
1539 | ||
1540 | if (!(req->rq_phase == RQ_PHASE_RPC || | |
1541 | req->rq_phase == RQ_PHASE_BULK || | |
1542 | req->rq_phase == RQ_PHASE_INTERPRET || | |
1543 | req->rq_phase == RQ_PHASE_UNREGISTERING || | |
1544 | req->rq_phase == RQ_PHASE_COMPLETE)) { | |
1545 | DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); | |
1546 | LBUG(); | |
1547 | } | |
1548 | ||
1549 | if (req->rq_phase == RQ_PHASE_UNREGISTERING) { | |
1550 | LASSERT(req->rq_next_phase != req->rq_phase); | |
1551 | LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); | |
1552 | ||
1553 | /* | |
1554 | * Skip processing until reply is unlinked. We | |
1555 | * can't return to pool before that and we can't | |
1556 | * call interpret before that. We need to make | |
1557 | * sure that all rdma transfers finished and will | |
1558 | * not corrupt any data. | |
1559 | */ | |
1560 | if (ptlrpc_client_recv_or_unlink(req) || | |
1561 | ptlrpc_client_bulk_active(req)) | |
1562 | continue; | |
1563 | ||
1564 | /* | |
1565 | * Turn fail_loc off to prevent it from looping | |
1566 | * forever. | |
1567 | */ | |
1568 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { | |
1569 | OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, | |
1570 | OBD_FAIL_ONCE); | |
1571 | } | |
1572 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { | |
1573 | OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, | |
1574 | OBD_FAIL_ONCE); | |
1575 | } | |
1576 | ||
1577 | /* | |
1578 | * Move to next phase if reply was successfully | |
1579 | * unlinked. | |
1580 | */ | |
1581 | ptlrpc_rqphase_move(req, req->rq_next_phase); | |
1582 | } | |
1583 | ||
1584 | if (req->rq_phase == RQ_PHASE_COMPLETE) | |
1585 | continue; | |
1586 | ||
1587 | if (req->rq_phase == RQ_PHASE_INTERPRET) | |
a9b3e8f3 | 1588 | goto interpret; |
d7e09d03 PT |
1589 | |
1590 | /* | |
1591 | * Note that this also will start async reply unlink. | |
1592 | */ | |
1593 | if (req->rq_net_err && !req->rq_timedout) { | |
1594 | ptlrpc_expire_one_request(req, 1); | |
1595 | ||
1596 | /* | |
1597 | * Check if we still need to wait for unlink. | |
1598 | */ | |
1599 | if (ptlrpc_client_recv_or_unlink(req) || | |
1600 | ptlrpc_client_bulk_active(req)) | |
1601 | continue; | |
1602 | /* If there is no need to resend, fail it now. */ | |
1603 | if (req->rq_no_resend) { | |
1604 | if (req->rq_status == 0) | |
1605 | req->rq_status = -EIO; | |
1606 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
a9b3e8f3 | 1607 | goto interpret; |
d7e09d03 PT |
1608 | } else { |
1609 | continue; | |
1610 | } | |
1611 | } | |
1612 | ||
1613 | if (req->rq_err) { | |
1614 | spin_lock(&req->rq_lock); | |
1615 | req->rq_replied = 0; | |
1616 | spin_unlock(&req->rq_lock); | |
1617 | if (req->rq_status == 0) | |
1618 | req->rq_status = -EIO; | |
1619 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
a9b3e8f3 | 1620 | goto interpret; |
d7e09d03 PT |
1621 | } |
1622 | ||
1623 | /* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr | |
1624 | * so it sets rq_intr regardless of individual rpc | |
1625 | * timeouts. The synchronous IO waiting path sets | |
1626 | * rq_intr irrespective of whether ptlrpcd | |
1627 | * has seen a timeout. Our policy is to only interpret | |
1628 | * interrupted rpcs after they have timed out, so we | |
1629 | * need to enforce that here. | |
1630 | */ | |
1631 | ||
1632 | if (req->rq_intr && (req->rq_timedout || req->rq_waiting || | |
1633 | req->rq_wait_ctx)) { | |
1634 | req->rq_status = -EINTR; | |
1635 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
a9b3e8f3 | 1636 | goto interpret; |
d7e09d03 PT |
1637 | } |
1638 | ||
1639 | if (req->rq_phase == RQ_PHASE_RPC) { | |
1640 | if (req->rq_timedout || req->rq_resend || | |
1641 | req->rq_waiting || req->rq_wait_ctx) { | |
1642 | int status; | |
1643 | ||
1644 | if (!ptlrpc_unregister_reply(req, 1)) | |
1645 | continue; | |
1646 | ||
1647 | spin_lock(&imp->imp_lock); | |
cb68dd2d KM |
1648 | if (ptlrpc_import_delay_req(imp, req, |
1649 | &status)) { | |
d7e09d03 PT |
1650 | /* put on delay list - only if we wait |
1651 | * recovery finished - before send */ | |
1652 | list_del_init(&req->rq_list); | |
1653 | list_add_tail(&req->rq_list, | |
1654 | &imp-> | |
1655 | imp_delayed_list); | |
1656 | spin_unlock(&imp->imp_lock); | |
1657 | continue; | |
1658 | } | |
1659 | ||
1660 | if (status != 0) { | |
1661 | req->rq_status = status; | |
1662 | ptlrpc_rqphase_move(req, | |
1663 | RQ_PHASE_INTERPRET); | |
1664 | spin_unlock(&imp->imp_lock); | |
a9b3e8f3 | 1665 | goto interpret; |
d7e09d03 PT |
1666 | } |
1667 | if (ptlrpc_no_resend(req) && | |
1668 | !req->rq_wait_ctx) { | |
1669 | req->rq_status = -ENOTCONN; | |
1670 | ptlrpc_rqphase_move(req, | |
1671 | RQ_PHASE_INTERPRET); | |
1672 | spin_unlock(&imp->imp_lock); | |
a9b3e8f3 | 1673 | goto interpret; |
d7e09d03 PT |
1674 | } |
1675 | ||
1676 | list_del_init(&req->rq_list); | |
1677 | list_add_tail(&req->rq_list, | |
1678 | &imp->imp_sending_list); | |
1679 | ||
1680 | spin_unlock(&imp->imp_lock); | |
1681 | ||
1682 | spin_lock(&req->rq_lock); | |
1683 | req->rq_waiting = 0; | |
1684 | spin_unlock(&req->rq_lock); | |
1685 | ||
1686 | if (req->rq_timedout || req->rq_resend) { | |
1687 | /* This is re-sending anyways, | |
1688 | * let's mark req as resend. */ | |
1689 | spin_lock(&req->rq_lock); | |
1690 | req->rq_resend = 1; | |
1691 | spin_unlock(&req->rq_lock); | |
1692 | if (req->rq_bulk) { | |
1693 | __u64 old_xid; | |
1694 | ||
1695 | if (!ptlrpc_unregister_bulk(req, 1)) | |
1696 | continue; | |
1697 | ||
1698 | /* ensure previous bulk fails */ | |
1699 | old_xid = req->rq_xid; | |
1700 | req->rq_xid = ptlrpc_next_xid(); | |
b0f5aad5 | 1701 | CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n", |
d7e09d03 PT |
1702 | old_xid, req->rq_xid); |
1703 | } | |
1704 | } | |
1705 | /* | |
1706 | * rq_wait_ctx is only touched by ptlrpcd, | |
1707 | * so no lock is needed here. | |
1708 | */ | |
1709 | status = sptlrpc_req_refresh_ctx(req, -1); | |
1710 | if (status) { | |
1711 | if (req->rq_err) { | |
1712 | req->rq_status = status; | |
1713 | spin_lock(&req->rq_lock); | |
1714 | req->rq_wait_ctx = 0; | |
1715 | spin_unlock(&req->rq_lock); | |
1716 | force_timer_recalc = 1; | |
1717 | } else { | |
1718 | spin_lock(&req->rq_lock); | |
1719 | req->rq_wait_ctx = 1; | |
1720 | spin_unlock(&req->rq_lock); | |
1721 | } | |
1722 | ||
1723 | continue; | |
1724 | } else { | |
1725 | spin_lock(&req->rq_lock); | |
1726 | req->rq_wait_ctx = 0; | |
1727 | spin_unlock(&req->rq_lock); | |
1728 | } | |
1729 | ||
1730 | rc = ptl_send_rpc(req, 0); | |
1731 | if (rc) { | |
1732 | DEBUG_REQ(D_HA, req, | |
1733 | "send failed: rc = %d", rc); | |
1734 | force_timer_recalc = 1; | |
1735 | spin_lock(&req->rq_lock); | |
1736 | req->rq_net_err = 1; | |
1737 | spin_unlock(&req->rq_lock); | |
e3bceb23 | 1738 | continue; |
d7e09d03 PT |
1739 | } |
1740 | /* need to reset the timeout */ | |
1741 | force_timer_recalc = 1; | |
1742 | } | |
1743 | ||
1744 | spin_lock(&req->rq_lock); | |
1745 | ||
1746 | if (ptlrpc_client_early(req)) { | |
1747 | ptlrpc_at_recv_early_reply(req); | |
1748 | spin_unlock(&req->rq_lock); | |
1749 | continue; | |
1750 | } | |
1751 | ||
1752 | /* Still waiting for a reply? */ | |
1753 | if (ptlrpc_client_recv(req)) { | |
1754 | spin_unlock(&req->rq_lock); | |
1755 | continue; | |
1756 | } | |
1757 | ||
1758 | /* Did we actually receive a reply? */ | |
1759 | if (!ptlrpc_client_replied(req)) { | |
1760 | spin_unlock(&req->rq_lock); | |
1761 | continue; | |
1762 | } | |
1763 | ||
1764 | spin_unlock(&req->rq_lock); | |
1765 | ||
1766 | /* unlink from net because we are going to | |
1767 | * swab in-place of reply buffer */ | |
1768 | unregistered = ptlrpc_unregister_reply(req, 1); | |
1769 | if (!unregistered) | |
1770 | continue; | |
1771 | ||
1772 | req->rq_status = after_reply(req); | |
1773 | if (req->rq_resend) | |
1774 | continue; | |
1775 | ||
1776 | /* If there is no bulk associated with this request, | |
1777 | * then we're done and should let the interpreter | |
1778 | * process the reply. Similarly if the RPC returned | |
1779 | * an error, and therefore the bulk will never arrive. | |
1780 | */ | |
1781 | if (req->rq_bulk == NULL || req->rq_status < 0) { | |
1782 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
a9b3e8f3 | 1783 | goto interpret; |
d7e09d03 PT |
1784 | } |
1785 | ||
1786 | ptlrpc_rqphase_move(req, RQ_PHASE_BULK); | |
1787 | } | |
1788 | ||
1789 | LASSERT(req->rq_phase == RQ_PHASE_BULK); | |
1790 | if (ptlrpc_client_bulk_active(req)) | |
1791 | continue; | |
1792 | ||
1793 | if (req->rq_bulk->bd_failure) { | |
1794 | /* The RPC reply arrived OK, but the bulk screwed | |
1795 | * up! Dead weird since the server told us the RPC | |
1796 | * was good after getting the REPLY for her GET or | |
1797 | * the ACK for her PUT. */ | |
1798 | DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); | |
1799 | req->rq_status = -EIO; | |
1800 | } | |
1801 | ||
1802 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
1803 | ||
7f1d15a8 | 1804 | interpret: |
d7e09d03 PT |
1805 | LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); |
1806 | ||
1807 | /* This moves to "unregistering" phase we need to wait for | |
1808 | * reply unlink. */ | |
1809 | if (!unregistered && !ptlrpc_unregister_reply(req, 1)) { | |
1810 | /* start async bulk unlink too */ | |
1811 | ptlrpc_unregister_bulk(req, 1); | |
1812 | continue; | |
1813 | } | |
1814 | ||
1815 | if (!ptlrpc_unregister_bulk(req, 1)) | |
1816 | continue; | |
1817 | ||
1818 | /* When calling interpret receiving already should be | |
1819 | * finished. */ | |
1820 | LASSERT(!req->rq_receiving_reply); | |
1821 | ||
1822 | ptlrpc_req_interpret(env, req, req->rq_status); | |
1823 | ||
82a373ae LZ |
1824 | if (ptlrpcd_check_work(req)) { |
1825 | atomic_dec(&set->set_remaining); | |
1826 | continue; | |
1827 | } | |
d7e09d03 PT |
1828 | ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); |
1829 | ||
1830 | CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0, | |
1831 | "Completed RPC pname:cluuid:pid:xid:nid:" | |
b0f5aad5 | 1832 | "opc %s:%s:%d:%llu:%s:%d\n", |
d7e09d03 PT |
1833 | current_comm(), imp->imp_obd->obd_uuid.uuid, |
1834 | lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, | |
1835 | libcfs_nid2str(imp->imp_connection->c_peer.nid), | |
1836 | lustre_msg_get_opc(req->rq_reqmsg)); | |
1837 | ||
1838 | spin_lock(&imp->imp_lock); | |
1839 | /* Request already may be not on sending or delaying list. This | |
1840 | * may happen in the case of marking it erroneous for the case | |
1841 | * ptlrpc_import_delay_req(req, status) find it impossible to | |
1842 | * allow sending this rpc and returns *status != 0. */ | |
1843 | if (!list_empty(&req->rq_list)) { | |
1844 | list_del_init(&req->rq_list); | |
1845 | atomic_dec(&imp->imp_inflight); | |
1846 | } | |
1847 | spin_unlock(&imp->imp_lock); | |
1848 | ||
1849 | atomic_dec(&set->set_remaining); | |
1850 | wake_up_all(&imp->imp_recovery_waitq); | |
1851 | ||
1852 | if (set->set_producer) { | |
1853 | /* produce a new request if possible */ | |
1854 | if (ptlrpc_set_producer(set) > 0) | |
1855 | force_timer_recalc = 1; | |
1856 | ||
1857 | /* free the request that has just been completed | |
1858 | * in order not to pollute set->set_requests */ | |
1859 | list_del_init(&req->rq_set_chain); | |
1860 | spin_lock(&req->rq_lock); | |
1861 | req->rq_set = NULL; | |
1862 | req->rq_invalid_rqset = 0; | |
1863 | spin_unlock(&req->rq_lock); | |
1864 | ||
1865 | /* record rq_status to compute the final status later */ | |
1866 | if (req->rq_status != 0) | |
1867 | set->set_rc = req->rq_status; | |
1868 | ptlrpc_req_finished(req); | |
1869 | } | |
1870 | } | |
1871 | ||
1872 | /* If we hit an error, we want to recover promptly. */ | |
0a3bdb00 | 1873 | return atomic_read(&set->set_remaining) == 0 || force_timer_recalc; |
d7e09d03 PT |
1874 | } |
1875 | EXPORT_SYMBOL(ptlrpc_check_set); | |
1876 | ||
1877 | /** | |
1878 | * Time out request \a req. is \a async_unlink is set, that means do not wait | |
1879 | * until LNet actually confirms network buffer unlinking. | |
1880 | * Return 1 if we should give up further retrying attempts or 0 otherwise. | |
1881 | */ | |
1882 | int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) | |
1883 | { | |
1884 | struct obd_import *imp = req->rq_import; | |
1885 | int rc = 0; | |
d7e09d03 PT |
1886 | |
1887 | spin_lock(&req->rq_lock); | |
1888 | req->rq_timedout = 1; | |
1889 | spin_unlock(&req->rq_lock); | |
1890 | ||
1891 | DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T | |
1892 | "/real "CFS_DURATION_T"]", | |
1893 | req->rq_net_err ? "failed due to network error" : | |
1894 | ((req->rq_real_sent == 0 || | |
699503bc | 1895 | time_before((unsigned long)req->rq_real_sent, (unsigned long)req->rq_sent) || |
d7e09d03 PT |
1896 | cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ? |
1897 | "timed out for sent delay" : "timed out for slow reply"), | |
1898 | req->rq_sent, req->rq_real_sent); | |
1899 | ||
1900 | if (imp != NULL && obd_debug_peer_on_timeout) | |
1901 | LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer); | |
1902 | ||
1903 | ptlrpc_unregister_reply(req, async_unlink); | |
1904 | ptlrpc_unregister_bulk(req, async_unlink); | |
1905 | ||
1906 | if (obd_dump_on_timeout) | |
1907 | libcfs_debug_dumplog(); | |
1908 | ||
1909 | if (imp == NULL) { | |
1910 | DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); | |
0a3bdb00 | 1911 | return 1; |
d7e09d03 PT |
1912 | } |
1913 | ||
1914 | atomic_inc(&imp->imp_timeouts); | |
1915 | ||
1916 | /* The DLM server doesn't want recovery run on its imports. */ | |
1917 | if (imp->imp_dlm_fake) | |
0a3bdb00 | 1918 | return 1; |
d7e09d03 PT |
1919 | |
1920 | /* If this request is for recovery or other primordial tasks, | |
1921 | * then error it out here. */ | |
1922 | if (req->rq_ctx_init || req->rq_ctx_fini || | |
1923 | req->rq_send_state != LUSTRE_IMP_FULL || | |
1924 | imp->imp_obd->obd_no_recov) { | |
1925 | DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", | |
1926 | ptlrpc_import_state_name(req->rq_send_state), | |
1927 | ptlrpc_import_state_name(imp->imp_state)); | |
1928 | spin_lock(&req->rq_lock); | |
1929 | req->rq_status = -ETIMEDOUT; | |
1930 | req->rq_err = 1; | |
1931 | spin_unlock(&req->rq_lock); | |
0a3bdb00 | 1932 | return 1; |
d7e09d03 PT |
1933 | } |
1934 | ||
1935 | /* if a request can't be resent we can't wait for an answer after | |
1936 | the timeout */ | |
1937 | if (ptlrpc_no_resend(req)) { | |
1938 | DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); | |
1939 | rc = 1; | |
1940 | } | |
1941 | ||
1942 | ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); | |
1943 | ||
0a3bdb00 | 1944 | return rc; |
d7e09d03 PT |
1945 | } |
1946 | ||
1947 | /** | |
1948 | * Time out all uncompleted requests in request set pointed by \a data | |
1949 | * Callback used when waiting on sets with l_wait_event. | |
1950 | * Always returns 1. | |
1951 | */ | |
1952 | int ptlrpc_expired_set(void *data) | |
1953 | { | |
1954 | struct ptlrpc_request_set *set = data; | |
1955 | struct list_head *tmp; | |
7264b8a5 | 1956 | time_t now = get_seconds(); |
d7e09d03 PT |
1957 | |
1958 | LASSERT(set != NULL); | |
1959 | ||
1960 | /* | |
1961 | * A timeout expired. See which reqs it applies to... | |
1962 | */ | |
3949015e | 1963 | list_for_each(tmp, &set->set_requests) { |
d7e09d03 PT |
1964 | struct ptlrpc_request *req = |
1965 | list_entry(tmp, struct ptlrpc_request, | |
1966 | rq_set_chain); | |
1967 | ||
1968 | /* don't expire request waiting for context */ | |
1969 | if (req->rq_wait_ctx) | |
1970 | continue; | |
1971 | ||
1972 | /* Request in-flight? */ | |
1973 | if (!((req->rq_phase == RQ_PHASE_RPC && | |
1974 | !req->rq_waiting && !req->rq_resend) || | |
1975 | (req->rq_phase == RQ_PHASE_BULK))) | |
1976 | continue; | |
1977 | ||
1978 | if (req->rq_timedout || /* already dealt with */ | |
1979 | req->rq_deadline > now) /* not expired */ | |
1980 | continue; | |
1981 | ||
1982 | /* Deal with this guy. Do it asynchronously to not block | |
1983 | * ptlrpcd thread. */ | |
1984 | ptlrpc_expire_one_request(req, 1); | |
1985 | } | |
1986 | ||
1987 | /* | |
1988 | * When waiting for a whole set, we always break out of the | |
1989 | * sleep so we can recalculate the timeout, or enable interrupts | |
1990 | * if everyone's timed out. | |
1991 | */ | |
0a3bdb00 | 1992 | return 1; |
d7e09d03 PT |
1993 | } |
1994 | EXPORT_SYMBOL(ptlrpc_expired_set); | |
1995 | ||
1996 | /** | |
1997 | * Sets rq_intr flag in \a req under spinlock. | |
1998 | */ | |
1999 | void ptlrpc_mark_interrupted(struct ptlrpc_request *req) | |
2000 | { | |
2001 | spin_lock(&req->rq_lock); | |
2002 | req->rq_intr = 1; | |
2003 | spin_unlock(&req->rq_lock); | |
2004 | } | |
2005 | EXPORT_SYMBOL(ptlrpc_mark_interrupted); | |
2006 | ||
2007 | /** | |
2008 | * Interrupts (sets interrupted flag) all uncompleted requests in | |
2009 | * a set \a data. Callback for l_wait_event for interruptible waits. | |
2010 | */ | |
2011 | void ptlrpc_interrupted_set(void *data) | |
2012 | { | |
2013 | struct ptlrpc_request_set *set = data; | |
2014 | struct list_head *tmp; | |
2015 | ||
2016 | LASSERT(set != NULL); | |
2017 | CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set); | |
2018 | ||
2019 | list_for_each(tmp, &set->set_requests) { | |
2020 | struct ptlrpc_request *req = | |
2021 | list_entry(tmp, struct ptlrpc_request, | |
2022 | rq_set_chain); | |
2023 | ||
2024 | if (req->rq_phase != RQ_PHASE_RPC && | |
2025 | req->rq_phase != RQ_PHASE_UNREGISTERING) | |
2026 | continue; | |
2027 | ||
2028 | ptlrpc_mark_interrupted(req); | |
2029 | } | |
2030 | } | |
2031 | EXPORT_SYMBOL(ptlrpc_interrupted_set); | |
2032 | ||
2033 | /** | |
2034 | * Get the smallest timeout in the set; this does NOT set a timeout. | |
2035 | */ | |
2036 | int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) | |
2037 | { | |
2038 | struct list_head *tmp; | |
7264b8a5 | 2039 | time_t now = get_seconds(); |
d7e09d03 PT |
2040 | int timeout = 0; |
2041 | struct ptlrpc_request *req; | |
2042 | int deadline; | |
d7e09d03 | 2043 | |
d7e09d03 PT |
2044 | list_for_each(tmp, &set->set_requests) { |
2045 | req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); | |
2046 | ||
2047 | /* | |
2048 | * Request in-flight? | |
2049 | */ | |
2050 | if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || | |
2051 | (req->rq_phase == RQ_PHASE_BULK) || | |
2052 | (req->rq_phase == RQ_PHASE_NEW))) | |
2053 | continue; | |
2054 | ||
2055 | /* | |
2056 | * Already timed out. | |
2057 | */ | |
2058 | if (req->rq_timedout) | |
2059 | continue; | |
2060 | ||
2061 | /* | |
2062 | * Waiting for ctx. | |
2063 | */ | |
2064 | if (req->rq_wait_ctx) | |
2065 | continue; | |
2066 | ||
2067 | if (req->rq_phase == RQ_PHASE_NEW) | |
2068 | deadline = req->rq_sent; | |
2069 | else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend) | |
2070 | deadline = req->rq_sent; | |
2071 | else | |
2072 | deadline = req->rq_sent + req->rq_timeout; | |
2073 | ||
2074 | if (deadline <= now) /* actually expired already */ | |
2075 | timeout = 1; /* ASAP */ | |
2076 | else if (timeout == 0 || timeout > deadline - now) | |
2077 | timeout = deadline - now; | |
2078 | } | |
0a3bdb00 | 2079 | return timeout; |
d7e09d03 PT |
2080 | } |
2081 | EXPORT_SYMBOL(ptlrpc_set_next_timeout); | |
2082 | ||
2083 | /** | |
930cef9a | 2084 | * Send all unset request from the set and then wait until all |
d7e09d03 PT |
2085 | * requests in the set complete (either get a reply, timeout, get an |
2086 | * error or otherwise be interrupted). | |
2087 | * Returns 0 on success or error code otherwise. | |
2088 | */ | |
2089 | int ptlrpc_set_wait(struct ptlrpc_request_set *set) | |
2090 | { | |
2091 | struct list_head *tmp; | |
2092 | struct ptlrpc_request *req; | |
2093 | struct l_wait_info lwi; | |
2094 | int rc, timeout; | |
d7e09d03 PT |
2095 | |
2096 | if (set->set_producer) | |
2097 | (void)ptlrpc_set_producer(set); | |
2098 | else | |
2099 | list_for_each(tmp, &set->set_requests) { | |
2100 | req = list_entry(tmp, struct ptlrpc_request, | |
2101 | rq_set_chain); | |
2102 | if (req->rq_phase == RQ_PHASE_NEW) | |
2103 | (void)ptlrpc_send_new_req(req); | |
2104 | } | |
2105 | ||
2106 | if (list_empty(&set->set_requests)) | |
0a3bdb00 | 2107 | return 0; |
d7e09d03 PT |
2108 | |
2109 | do { | |
2110 | timeout = ptlrpc_set_next_timeout(set); | |
2111 | ||
2112 | /* wait until all complete, interrupted, or an in-flight | |
2113 | * req times out */ | |
2114 | CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n", | |
2115 | set, timeout); | |
2116 | ||
2117 | if (timeout == 0 && !cfs_signal_pending()) | |
2118 | /* | |
2119 | * No requests are in-flight (ether timed out | |
2120 | * or delayed), so we can allow interrupts. | |
2121 | * We still want to block for a limited time, | |
2122 | * so we allow interrupts during the timeout. | |
2123 | */ | |
2124 | lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1), | |
2125 | ptlrpc_expired_set, | |
2126 | ptlrpc_interrupted_set, set); | |
2127 | else | |
2128 | /* | |
2129 | * At least one request is in flight, so no | |
2130 | * interrupts are allowed. Wait until all | |
2131 | * complete, or an in-flight req times out. | |
2132 | */ | |
0ae015be | 2133 | lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1), |
d7e09d03 PT |
2134 | ptlrpc_expired_set, set); |
2135 | ||
2136 | rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi); | |
2137 | ||
2138 | /* LU-769 - if we ignored the signal because it was already | |
2139 | * pending when we started, we need to handle it now or we risk | |
2140 | * it being ignored forever */ | |
2141 | if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr && | |
2142 | cfs_signal_pending()) { | |
2143 | sigset_t blocked_sigs = | |
2144 | cfs_block_sigsinv(LUSTRE_FATAL_SIGS); | |
2145 | ||
2146 | /* In fact we only interrupt for the "fatal" signals | |
2147 | * like SIGINT or SIGKILL. We still ignore less | |
2148 | * important signals since ptlrpc set is not easily | |
2149 | * reentrant from userspace again */ | |
2150 | if (cfs_signal_pending()) | |
2151 | ptlrpc_interrupted_set(set); | |
2152 | cfs_restore_sigs(blocked_sigs); | |
2153 | } | |
2154 | ||
2155 | LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT); | |
2156 | ||
2157 | /* -EINTR => all requests have been flagged rq_intr so next | |
2158 | * check completes. | |
2159 | * -ETIMEDOUT => someone timed out. When all reqs have | |
2160 | * timed out, signals are enabled allowing completion with | |
2161 | * EINTR. | |
2162 | * I don't really care if we go once more round the loop in | |
2163 | * the error cases -eeb. */ | |
2164 | if (rc == 0 && atomic_read(&set->set_remaining) == 0) { | |
2165 | list_for_each(tmp, &set->set_requests) { | |
2166 | req = list_entry(tmp, struct ptlrpc_request, | |
2167 | rq_set_chain); | |
2168 | spin_lock(&req->rq_lock); | |
2169 | req->rq_invalid_rqset = 1; | |
2170 | spin_unlock(&req->rq_lock); | |
2171 | } | |
2172 | } | |
2173 | } while (rc != 0 || atomic_read(&set->set_remaining) != 0); | |
2174 | ||
2175 | LASSERT(atomic_read(&set->set_remaining) == 0); | |
2176 | ||
2177 | rc = set->set_rc; /* rq_status of already freed requests if any */ | |
2178 | list_for_each(tmp, &set->set_requests) { | |
2179 | req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); | |
2180 | ||
2181 | LASSERT(req->rq_phase == RQ_PHASE_COMPLETE); | |
2182 | if (req->rq_status != 0) | |
2183 | rc = req->rq_status; | |
2184 | } | |
2185 | ||
2186 | if (set->set_interpret != NULL) { | |
0ae015be | 2187 | int (*interpreter)(struct ptlrpc_request_set *set, void *, int) = |
d7e09d03 PT |
2188 | set->set_interpret; |
2189 | rc = interpreter (set, set->set_arg, rc); | |
2190 | } else { | |
2191 | struct ptlrpc_set_cbdata *cbdata, *n; | |
2192 | int err; | |
2193 | ||
2194 | list_for_each_entry_safe(cbdata, n, | |
2195 | &set->set_cblist, psc_item) { | |
2196 | list_del_init(&cbdata->psc_item); | |
2197 | err = cbdata->psc_interpret(set, cbdata->psc_data, rc); | |
2198 | if (err && !rc) | |
2199 | rc = err; | |
2200 | OBD_FREE_PTR(cbdata); | |
2201 | } | |
2202 | } | |
2203 | ||
0a3bdb00 | 2204 | return rc; |
d7e09d03 PT |
2205 | } |
2206 | EXPORT_SYMBOL(ptlrpc_set_wait); | |
2207 | ||
2208 | /** | |
930cef9a | 2209 | * Helper function for request freeing. |
d7e09d03 PT |
2210 | * Called when request count reached zero and request needs to be freed. |
2211 | * Removes request from all sorts of sending/replay lists it might be on, | |
2212 | * frees network buffers if any are present. | |
2213 | * If \a locked is set, that means caller is already holding import imp_lock | |
2214 | * and so we no longer need to reobtain it (for certain lists manipulations) | |
2215 | */ | |
2216 | static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) | |
2217 | { | |
3ff28049 | 2218 | if (request == NULL) |
d7e09d03 | 2219 | return; |
d7e09d03 | 2220 | LASSERTF(!request->rq_receiving_reply, "req %p\n", request); |
0ae015be | 2221 | LASSERTF(request->rq_rqbd == NULL, "req %p\n", request);/* client-side */ |
d7e09d03 PT |
2222 | LASSERTF(list_empty(&request->rq_list), "req %p\n", request); |
2223 | LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); | |
2224 | LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request); | |
2225 | LASSERTF(!request->rq_replay, "req %p\n", request); | |
2226 | ||
2227 | req_capsule_fini(&request->rq_pill); | |
2228 | ||
2229 | /* We must take it off the imp_replay_list first. Otherwise, we'll set | |
2230 | * request->rq_reqmsg to NULL while osc_close is dereferencing it. */ | |
2231 | if (request->rq_import != NULL) { | |
2232 | if (!locked) | |
2233 | spin_lock(&request->rq_import->imp_lock); | |
2234 | list_del_init(&request->rq_replay_list); | |
2235 | if (!locked) | |
2236 | spin_unlock(&request->rq_import->imp_lock); | |
2237 | } | |
2238 | LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request); | |
2239 | ||
2240 | if (atomic_read(&request->rq_refcount) != 0) { | |
2241 | DEBUG_REQ(D_ERROR, request, | |
2242 | "freeing request with nonzero refcount"); | |
2243 | LBUG(); | |
2244 | } | |
2245 | ||
2246 | if (request->rq_repbuf != NULL) | |
2247 | sptlrpc_cli_free_repbuf(request); | |
2248 | if (request->rq_export != NULL) { | |
2249 | class_export_put(request->rq_export); | |
2250 | request->rq_export = NULL; | |
2251 | } | |
2252 | if (request->rq_import != NULL) { | |
2253 | class_import_put(request->rq_import); | |
2254 | request->rq_import = NULL; | |
2255 | } | |
2256 | if (request->rq_bulk != NULL) | |
2257 | ptlrpc_free_bulk_pin(request->rq_bulk); | |
2258 | ||
2259 | if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL) | |
2260 | sptlrpc_cli_free_reqbuf(request); | |
2261 | ||
2262 | if (request->rq_cli_ctx) | |
2263 | sptlrpc_req_put_ctx(request, !locked); | |
2264 | ||
2265 | if (request->rq_pool) | |
2266 | __ptlrpc_free_req_to_pool(request); | |
2267 | else | |
35b2e1b7 | 2268 | ptlrpc_request_cache_free(request); |
d7e09d03 PT |
2269 | } |
2270 | ||
2271 | static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked); | |
2272 | /** | |
2273 | * Drop one request reference. Must be called with import imp_lock held. | |
930cef9a | 2274 | * When reference count drops to zero, request is freed. |
d7e09d03 PT |
2275 | */ |
2276 | void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request) | |
2277 | { | |
5e42bc9d | 2278 | assert_spin_locked(&request->rq_import->imp_lock); |
d7e09d03 PT |
2279 | (void)__ptlrpc_req_finished(request, 1); |
2280 | } | |
2281 | EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock); | |
2282 | ||
2283 | /** | |
2284 | * Helper function | |
2285 | * Drops one reference count for request \a request. | |
2286 | * \a locked set indicates that caller holds import imp_lock. | |
930cef9a | 2287 | * Frees the request when reference count reaches zero. |
d7e09d03 PT |
2288 | */ |
2289 | static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) | |
2290 | { | |
d7e09d03 | 2291 | if (request == NULL) |
0a3bdb00 | 2292 | return 1; |
d7e09d03 PT |
2293 | |
2294 | if (request == LP_POISON || | |
2295 | request->rq_reqmsg == LP_POISON) { | |
2296 | CERROR("dereferencing freed request (bug 575)\n"); | |
2297 | LBUG(); | |
0a3bdb00 | 2298 | return 1; |
d7e09d03 PT |
2299 | } |
2300 | ||
2301 | DEBUG_REQ(D_INFO, request, "refcount now %u", | |
2302 | atomic_read(&request->rq_refcount) - 1); | |
2303 | ||
2304 | if (atomic_dec_and_test(&request->rq_refcount)) { | |
2305 | __ptlrpc_free_req(request, locked); | |
0a3bdb00 | 2306 | return 1; |
d7e09d03 PT |
2307 | } |
2308 | ||
0a3bdb00 | 2309 | return 0; |
d7e09d03 PT |
2310 | } |
2311 | ||
/**
 * Drops one reference count for a request.
 * Variant for callers that do not hold the import imp_lock
 * (passes locked == 0 to the helper).
 */
void ptlrpc_req_finished(struct ptlrpc_request *request)
{
	(void)__ptlrpc_req_finished(request, 0);
}
EXPORT_SYMBOL(ptlrpc_req_finished);
2320 | ||
2321 | /** | |
2322 | * Returns xid of a \a request | |
2323 | */ | |
2324 | __u64 ptlrpc_req_xid(struct ptlrpc_request *request) | |
2325 | { | |
2326 | return request->rq_xid; | |
2327 | } | |
2328 | EXPORT_SYMBOL(ptlrpc_req_xid); | |
2329 | ||
2330 | /** | |
2331 | * Disengage the client's reply buffer from the network | |
2332 | * NB does _NOT_ unregister any client-side bulk. | |
2333 | * IDEMPOTENT, but _not_ safe against concurrent callers. | |
2334 | * The request owner (i.e. the thread doing the I/O) must call... | |
2335 | * Returns 0 on success or 1 if unregistering cannot be made. | |
2336 | */ | |
2337 | int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) | |
2338 | { | |
2339 | int rc; | |
2340 | wait_queue_head_t *wq; | |
2341 | struct l_wait_info lwi; | |
2342 | ||
2343 | /* | |
2344 | * Might sleep. | |
2345 | */ | |
2346 | LASSERT(!in_interrupt()); | |
2347 | ||
2348 | /* | |
2349 | * Let's setup deadline for reply unlink. | |
2350 | */ | |
2351 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && | |
2352 | async && request->rq_reply_deadline == 0) | |
7264b8a5 | 2353 | request->rq_reply_deadline = get_seconds()+LONG_UNLINK; |
d7e09d03 PT |
2354 | |
2355 | /* | |
2356 | * Nothing left to do. | |
2357 | */ | |
2358 | if (!ptlrpc_client_recv_or_unlink(request)) | |
0a3bdb00 | 2359 | return 1; |
d7e09d03 PT |
2360 | |
2361 | LNetMDUnlink(request->rq_reply_md_h); | |
2362 | ||
2363 | /* | |
2364 | * Let's check it once again. | |
2365 | */ | |
2366 | if (!ptlrpc_client_recv_or_unlink(request)) | |
0a3bdb00 | 2367 | return 1; |
d7e09d03 PT |
2368 | |
2369 | /* | |
2370 | * Move to "Unregistering" phase as reply was not unlinked yet. | |
2371 | */ | |
2372 | ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING); | |
2373 | ||
2374 | /* | |
2375 | * Do not wait for unlink to finish. | |
2376 | */ | |
2377 | if (async) | |
0a3bdb00 | 2378 | return 0; |
d7e09d03 PT |
2379 | |
2380 | /* | |
2381 | * We have to l_wait_event() whatever the result, to give liblustre | |
2382 | * a chance to run reply_in_callback(), and to make sure we've | |
2383 | * unlinked before returning a req to the pool. | |
2384 | */ | |
2385 | if (request->rq_set != NULL) | |
2386 | wq = &request->rq_set->set_waitq; | |
2387 | else | |
2388 | wq = &request->rq_reply_waitq; | |
2389 | ||
2390 | for (;;) { | |
2391 | /* Network access will complete in finite time but the HUGE | |
2392 | * timeout lets us CWARN for visibility of sluggish NALs */ | |
2393 | lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), | |
2394 | cfs_time_seconds(1), NULL, NULL); | |
2395 | rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request), | |
2396 | &lwi); | |
2397 | if (rc == 0) { | |
2398 | ptlrpc_rqphase_move(request, request->rq_next_phase); | |
0a3bdb00 | 2399 | return 1; |
d7e09d03 PT |
2400 | } |
2401 | ||
2402 | LASSERT(rc == -ETIMEDOUT); | |
cf378ff7 AL |
2403 | DEBUG_REQ(D_WARNING, request, |
2404 | "Unexpectedly long timeout rvcng=%d unlnk=%d/%d", | |
2405 | request->rq_receiving_reply, | |
2406 | request->rq_req_unlink, request->rq_reply_unlink); | |
d7e09d03 | 2407 | } |
0a3bdb00 | 2408 | return 0; |
d7e09d03 PT |
2409 | } |
2410 | EXPORT_SYMBOL(ptlrpc_unregister_reply); | |
2411 | ||
63d42578 HZ |
2412 | static void ptlrpc_free_request(struct ptlrpc_request *req) |
2413 | { | |
2414 | spin_lock(&req->rq_lock); | |
2415 | req->rq_replay = 0; | |
2416 | spin_unlock(&req->rq_lock); | |
2417 | ||
2418 | if (req->rq_commit_cb != NULL) | |
2419 | req->rq_commit_cb(req); | |
2420 | list_del_init(&req->rq_replay_list); | |
2421 | ||
2422 | __ptlrpc_req_finished(req, 1); | |
2423 | } | |
2424 | ||
2425 | /** | |
2426 | * the request is committed and dropped from the replay list of its import | |
2427 | */ | |
2428 | void ptlrpc_request_committed(struct ptlrpc_request *req, int force) | |
2429 | { | |
2430 | struct obd_import *imp = req->rq_import; | |
2431 | ||
2432 | spin_lock(&imp->imp_lock); | |
2433 | if (list_empty(&req->rq_replay_list)) { | |
2434 | spin_unlock(&imp->imp_lock); | |
2435 | return; | |
2436 | } | |
2437 | ||
2438 | if (force || req->rq_transno <= imp->imp_peer_committed_transno) | |
2439 | ptlrpc_free_request(req); | |
2440 | ||
2441 | spin_unlock(&imp->imp_lock); | |
2442 | } | |
2443 | EXPORT_SYMBOL(ptlrpc_request_committed); | |
2444 | ||
d7e09d03 PT |
2445 | /** |
2446 | * Iterates through replay_list on import and prunes | |
2447 | * all requests have transno smaller than last_committed for the | |
2448 | * import and don't have rq_replay set. | |
930cef9a | 2449 | * Since requests are sorted in transno order, stops when meeting first |
d7e09d03 PT |
2450 | * transno bigger than last_committed. |
2451 | * caller must hold imp->imp_lock | |
2452 | */ | |
2453 | void ptlrpc_free_committed(struct obd_import *imp) | |
2454 | { | |
63d42578 | 2455 | struct ptlrpc_request *req, *saved; |
d7e09d03 | 2456 | struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ |
63d42578 | 2457 | bool skip_committed_list = true; |
d7e09d03 PT |
2458 | |
2459 | LASSERT(imp != NULL); | |
5e42bc9d | 2460 | assert_spin_locked(&imp->imp_lock); |
d7e09d03 PT |
2461 | |
2462 | if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked && | |
2463 | imp->imp_generation == imp->imp_last_generation_checked) { | |
b0f5aad5 | 2464 | CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n", |
d7e09d03 | 2465 | imp->imp_obd->obd_name, imp->imp_peer_committed_transno); |
d7e09d03 PT |
2466 | return; |
2467 | } | |
b0f5aad5 | 2468 | CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n", |
d7e09d03 PT |
2469 | imp->imp_obd->obd_name, imp->imp_peer_committed_transno, |
2470 | imp->imp_generation); | |
63d42578 HZ |
2471 | |
2472 | if (imp->imp_generation != imp->imp_last_generation_checked) | |
2473 | skip_committed_list = false; | |
2474 | ||
d7e09d03 PT |
2475 | imp->imp_last_transno_checked = imp->imp_peer_committed_transno; |
2476 | imp->imp_last_generation_checked = imp->imp_generation; | |
2477 | ||
63d42578 HZ |
2478 | list_for_each_entry_safe(req, saved, &imp->imp_replay_list, |
2479 | rq_replay_list) { | |
d7e09d03 PT |
2480 | /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ |
2481 | LASSERT(req != last_req); | |
2482 | last_req = req; | |
2483 | ||
2484 | if (req->rq_transno == 0) { | |
2485 | DEBUG_REQ(D_EMERG, req, "zero transno during replay"); | |
2486 | LBUG(); | |
2487 | } | |
2488 | if (req->rq_import_generation < imp->imp_generation) { | |
2489 | DEBUG_REQ(D_RPCTRACE, req, "free request with old gen"); | |
a9b3e8f3 | 2490 | goto free_req; |
d7e09d03 PT |
2491 | } |
2492 | ||
d7e09d03 PT |
2493 | /* not yet committed */ |
2494 | if (req->rq_transno > imp->imp_peer_committed_transno) { | |
2495 | DEBUG_REQ(D_RPCTRACE, req, "stopping search"); | |
2496 | break; | |
2497 | } | |
2498 | ||
63d42578 HZ |
2499 | if (req->rq_replay) { |
2500 | DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)"); | |
2501 | list_move_tail(&req->rq_replay_list, | |
2502 | &imp->imp_committed_list); | |
2503 | continue; | |
2504 | } | |
2505 | ||
b0f5aad5 | 2506 | DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)", |
d7e09d03 PT |
2507 | imp->imp_peer_committed_transno); |
2508 | free_req: | |
63d42578 HZ |
2509 | ptlrpc_free_request(req); |
2510 | } | |
2511 | if (skip_committed_list) | |
2512 | return; | |
2513 | ||
2514 | list_for_each_entry_safe(req, saved, &imp->imp_committed_list, | |
2515 | rq_replay_list) { | |
2516 | LASSERT(req->rq_transno != 0); | |
2517 | if (req->rq_import_generation < imp->imp_generation) { | |
2518 | DEBUG_REQ(D_RPCTRACE, req, "free stale open request"); | |
2519 | ptlrpc_free_request(req); | |
2520 | } | |
d7e09d03 | 2521 | } |
d7e09d03 PT |
2522 | } |
2523 | ||
/**
 * Client cleanup entry point for import \a imp.
 * Currently a no-op: the body is empty and \a imp is not touched.
 */
void ptlrpc_cleanup_client(struct obd_import *imp)
{
	/* intentionally empty */
}
EXPORT_SYMBOL(ptlrpc_cleanup_client);
2528 | ||
2529 | /** | |
2530 | * Schedule previously sent request for resend. | |
2531 | * For bulk requests we assign new xid (to avoid problems with | |
2532 | * lost replies and therefore several transfers landing into same buffer | |
2533 | * from different sending attempts). | |
2534 | */ | |
2535 | void ptlrpc_resend_req(struct ptlrpc_request *req) | |
2536 | { | |
2537 | DEBUG_REQ(D_HA, req, "going to resend"); | |
5c689e68 AB |
2538 | spin_lock(&req->rq_lock); |
2539 | ||
2540 | /* Request got reply but linked to the import list still. | |
2541 | Let ptlrpc_check_set() to process it. */ | |
2542 | if (ptlrpc_client_replied(req)) { | |
2543 | spin_unlock(&req->rq_lock); | |
2544 | DEBUG_REQ(D_HA, req, "it has reply, so skip it"); | |
2545 | return; | |
2546 | } | |
2547 | ||
d7e09d03 PT |
2548 | lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 }); |
2549 | req->rq_status = -EAGAIN; | |
2550 | ||
d7e09d03 PT |
2551 | req->rq_resend = 1; |
2552 | req->rq_net_err = 0; | |
2553 | req->rq_timedout = 0; | |
2554 | if (req->rq_bulk) { | |
2555 | __u64 old_xid = req->rq_xid; | |
2556 | ||
2557 | /* ensure previous bulk fails */ | |
2558 | req->rq_xid = ptlrpc_next_xid(); | |
b0f5aad5 | 2559 | CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n", |
d7e09d03 PT |
2560 | old_xid, req->rq_xid); |
2561 | } | |
2562 | ptlrpc_client_wake_req(req); | |
2563 | spin_unlock(&req->rq_lock); | |
2564 | } | |
2565 | EXPORT_SYMBOL(ptlrpc_resend_req); | |
2566 | ||
2567 | /* XXX: this function and rq_status are currently unused */ | |
2568 | void ptlrpc_restart_req(struct ptlrpc_request *req) | |
2569 | { | |
2570 | DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request"); | |
2571 | req->rq_status = -ERESTARTSYS; | |
2572 | ||
2573 | spin_lock(&req->rq_lock); | |
2574 | req->rq_restart = 1; | |
2575 | req->rq_timedout = 0; | |
2576 | ptlrpc_client_wake_req(req); | |
2577 | spin_unlock(&req->rq_lock); | |
2578 | } | |
2579 | EXPORT_SYMBOL(ptlrpc_restart_req); | |
2580 | ||
2581 | /** | |
2582 | * Grab additional reference on a request \a req | |
2583 | */ | |
2584 | struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) | |
2585 | { | |
d7e09d03 | 2586 | atomic_inc(&req->rq_refcount); |
0a3bdb00 | 2587 | return req; |
d7e09d03 PT |
2588 | } |
2589 | EXPORT_SYMBOL(ptlrpc_request_addref); | |
2590 | ||
2591 | /** | |
2592 | * Add a request to import replay_list. | |
2593 | * Must be called under imp_lock | |
2594 | */ | |
2595 | void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, | |
2596 | struct obd_import *imp) | |
2597 | { | |
2598 | struct list_head *tmp; | |
2599 | ||
5e42bc9d | 2600 | assert_spin_locked(&imp->imp_lock); |
d7e09d03 PT |
2601 | |
2602 | if (req->rq_transno == 0) { | |
2603 | DEBUG_REQ(D_EMERG, req, "saving request with zero transno"); | |
2604 | LBUG(); | |
2605 | } | |
2606 | ||
2607 | /* clear this for new requests that were resent as well | |
2608 | as resent replayed requests. */ | |
2609 | lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); | |
2610 | ||
2611 | /* don't re-add requests that have been replayed */ | |
2612 | if (!list_empty(&req->rq_replay_list)) | |
2613 | return; | |
2614 | ||
2615 | lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); | |
2616 | ||
2617 | LASSERT(imp->imp_replayable); | |
2618 | /* Balanced in ptlrpc_free_committed, usually. */ | |
2619 | ptlrpc_request_addref(req); | |
2620 | list_for_each_prev(tmp, &imp->imp_replay_list) { | |
2621 | struct ptlrpc_request *iter = | |
2622 | list_entry(tmp, struct ptlrpc_request, | |
2623 | rq_replay_list); | |
2624 | ||
2625 | /* We may have duplicate transnos if we create and then | |
2626 | * open a file, or for closes retained if to match creating | |
2627 | * opens, so use req->rq_xid as a secondary key. | |
2628 | * (See bugs 684, 685, and 428.) | |
2629 | * XXX no longer needed, but all opens need transnos! | |
2630 | */ | |
2631 | if (iter->rq_transno > req->rq_transno) | |
2632 | continue; | |
2633 | ||
2634 | if (iter->rq_transno == req->rq_transno) { | |
2635 | LASSERT(iter->rq_xid != req->rq_xid); | |
2636 | if (iter->rq_xid > req->rq_xid) | |
2637 | continue; | |
2638 | } | |
2639 | ||
2640 | list_add(&req->rq_replay_list, &iter->rq_replay_list); | |
2641 | return; | |
2642 | } | |
2643 | ||
2644 | list_add(&req->rq_replay_list, &imp->imp_replay_list); | |
2645 | } | |
2646 | EXPORT_SYMBOL(ptlrpc_retain_replayable_request); | |
2647 | ||
2648 | /** | |
2649 | * Send request and wait until it completes. | |
2650 | * Returns request processing status. | |
2651 | */ | |
2652 | int ptlrpc_queue_wait(struct ptlrpc_request *req) | |
2653 | { | |
2654 | struct ptlrpc_request_set *set; | |
2655 | int rc; | |
d7e09d03 PT |
2656 | |
2657 | LASSERT(req->rq_set == NULL); | |
2658 | LASSERT(!req->rq_receiving_reply); | |
2659 | ||
2660 | set = ptlrpc_prep_set(); | |
2661 | if (set == NULL) { | |
2662 | CERROR("Unable to allocate ptlrpc set."); | |
0a3bdb00 | 2663 | return -ENOMEM; |
d7e09d03 PT |
2664 | } |
2665 | ||
2666 | /* for distributed debugging */ | |
2667 | lustre_msg_set_status(req->rq_reqmsg, current_pid()); | |
2668 | ||
2669 | /* add a ref for the set (see comment in ptlrpc_set_add_req) */ | |
2670 | ptlrpc_request_addref(req); | |
2671 | ptlrpc_set_add_req(set, req); | |
2672 | rc = ptlrpc_set_wait(set); | |
2673 | ptlrpc_set_destroy(set); | |
2674 | ||
0a3bdb00 | 2675 | return rc; |
d7e09d03 PT |
2676 | } |
2677 | EXPORT_SYMBOL(ptlrpc_queue_wait); | |
2678 | ||
/**
 * State of a request saved by ptlrpc_replay_req() before the request is
 * queued for replay, and used by ptlrpc_replay_interpret() once the replay
 * has been processed.
 */
struct ptlrpc_replay_async_args {
	/* rq_send_state of the request before the replay */
	int praa_old_state;
	/* status of the reply message before the replay (0 if no reply) */
	int praa_old_status;
};
2683 | ||
2684 | /** | |
2685 | * Callback used for replayed requests reply processing. | |
930cef9a | 2686 | * In case of successful reply calls registered request replay callback. |
d7e09d03 PT |
2687 | * In case of error restart replay process. |
2688 | */ | |
2689 | static int ptlrpc_replay_interpret(const struct lu_env *env, | |
2690 | struct ptlrpc_request *req, | |
0028d585 | 2691 | void *data, int rc) |
d7e09d03 PT |
2692 | { |
2693 | struct ptlrpc_replay_async_args *aa = data; | |
2694 | struct obd_import *imp = req->rq_import; | |
2695 | ||
d7e09d03 PT |
2696 | atomic_dec(&imp->imp_replay_inflight); |
2697 | ||
2698 | if (!ptlrpc_client_replied(req)) { | |
2699 | CERROR("request replay timed out, restarting recovery\n"); | |
a9b3e8f3 JL |
2700 | rc = -ETIMEDOUT; |
2701 | goto out; | |
d7e09d03 PT |
2702 | } |
2703 | ||
2704 | if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR && | |
2705 | (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN || | |
a9b3e8f3 JL |
2706 | lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) { |
2707 | rc = lustre_msg_get_status(req->rq_repmsg); | |
2708 | goto out; | |
2709 | } | |
d7e09d03 PT |
2710 | |
2711 | /** VBR: check version failure */ | |
2712 | if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) { | |
2713 | /** replay was failed due to version mismatch */ | |
2714 | DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); | |
2715 | spin_lock(&imp->imp_lock); | |
2716 | imp->imp_vbr_failed = 1; | |
2717 | imp->imp_no_lock_replay = 1; | |
2718 | spin_unlock(&imp->imp_lock); | |
2719 | lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); | |
2720 | } else { | |
2721 | /** The transno had better not change over replay. */ | |
2722 | LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == | |
2723 | lustre_msg_get_transno(req->rq_repmsg) || | |
2724 | lustre_msg_get_transno(req->rq_repmsg) == 0, | |
55f5a824 | 2725 | "%#llx/%#llx\n", |
d7e09d03 PT |
2726 | lustre_msg_get_transno(req->rq_reqmsg), |
2727 | lustre_msg_get_transno(req->rq_repmsg)); | |
2728 | } | |
2729 | ||
2730 | spin_lock(&imp->imp_lock); | |
2731 | /** if replays by version then gap occur on server, no trust to locks */ | |
2732 | if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY) | |
2733 | imp->imp_no_lock_replay = 1; | |
2734 | imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); | |
2735 | spin_unlock(&imp->imp_lock); | |
2736 | LASSERT(imp->imp_last_replay_transno); | |
2737 | ||
2738 | /* transaction number shouldn't be bigger than the latest replayed */ | |
2739 | if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { | |
2740 | DEBUG_REQ(D_ERROR, req, | |
b0f5aad5 GKH |
2741 | "Reported transno %llu is bigger than the replayed one: %llu", |
2742 | req->rq_transno, | |
d7e09d03 | 2743 | lustre_msg_get_transno(req->rq_reqmsg)); |
a9b3e8f3 JL |
2744 | rc = -EINVAL; |
2745 | goto out; | |
d7e09d03 PT |
2746 | } |
2747 | ||
2748 | DEBUG_REQ(D_HA, req, "got rep"); | |
2749 | ||
2750 | /* let the callback do fixups, possibly including in the request */ | |
2751 | if (req->rq_replay_cb) | |
2752 | req->rq_replay_cb(req); | |
2753 | ||
2754 | if (ptlrpc_client_replied(req) && | |
2755 | lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { | |
2756 | DEBUG_REQ(D_ERROR, req, "status %d, old was %d", | |
2757 | lustre_msg_get_status(req->rq_repmsg), | |
2758 | aa->praa_old_status); | |
2759 | } else { | |
2760 | /* Put it back for re-replay. */ | |
2761 | lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); | |
2762 | } | |
2763 | ||
2764 | /* | |
2765 | * Errors while replay can set transno to 0, but | |
2766 | * imp_last_replay_transno shouldn't be set to 0 anyway | |
2767 | */ | |
2768 | if (req->rq_transno == 0) | |
2769 | CERROR("Transno is 0 during replay!\n"); | |
2770 | ||
2771 | /* continue with recovery */ | |
2772 | rc = ptlrpc_import_recovery_state_machine(imp); | |
2773 | out: | |
2774 | req->rq_send_state = aa->praa_old_state; | |
2775 | ||
2776 | if (rc != 0) | |
2777 | /* this replay failed, so restart recovery */ | |
2778 | ptlrpc_connect_import(imp); | |
2779 | ||
0a3bdb00 | 2780 | return rc; |
d7e09d03 PT |
2781 | } |
2782 | ||
2783 | /** | |
2784 | * Prepares and queues request for replay. | |
2785 | * Adds it to ptlrpcd queue for actual sending. | |
2786 | * Returns 0 on success. | |
2787 | */ | |
2788 | int ptlrpc_replay_req(struct ptlrpc_request *req) | |
2789 | { | |
2790 | struct ptlrpc_replay_async_args *aa; | |
d7e09d03 PT |
2791 | |
2792 | LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); | |
2793 | ||
3949015e | 2794 | LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); |
d7e09d03 | 2795 | aa = ptlrpc_req_async_args(req); |
ec83e611 | 2796 | memset(aa, 0, sizeof(*aa)); |
d7e09d03 PT |
2797 | |
2798 | /* Prepare request to be resent with ptlrpcd */ | |
2799 | aa->praa_old_state = req->rq_send_state; | |
2800 | req->rq_send_state = LUSTRE_IMP_REPLAY; | |
2801 | req->rq_phase = RQ_PHASE_NEW; | |
2802 | req->rq_next_phase = RQ_PHASE_UNDEFINED; | |
2803 | if (req->rq_repmsg) | |
2804 | aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); | |
2805 | req->rq_status = 0; | |
2806 | req->rq_interpret_reply = ptlrpc_replay_interpret; | |
2807 | /* Readjust the timeout for current conditions */ | |
2808 | ptlrpc_at_set_req_timeout(req); | |
2809 | ||
2810 | /* Tell server the net_latency, so the server can calculate how long | |
2811 | * it should wait for next replay */ | |
2812 | lustre_msg_set_service_time(req->rq_reqmsg, | |
2813 | ptlrpc_at_get_net_latency(req)); | |
2814 | DEBUG_REQ(D_HA, req, "REPLAY"); | |
2815 | ||
2816 | atomic_inc(&req->rq_import->imp_replay_inflight); | |
2817 | ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ | |
2818 | ||
2819 | ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); | |
0a3bdb00 | 2820 | return 0; |
d7e09d03 PT |
2821 | } |
2822 | EXPORT_SYMBOL(ptlrpc_replay_req); | |
2823 | ||
2824 | /** | |
2825 | * Aborts all in-flight request on import \a imp sending and delayed lists | |
2826 | */ | |
2827 | void ptlrpc_abort_inflight(struct obd_import *imp) | |
2828 | { | |
2829 | struct list_head *tmp, *n; | |
d7e09d03 PT |
2830 | |
2831 | /* Make sure that no new requests get processed for this import. | |
2832 | * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing | |
2833 | * this flag and then putting requests on sending_list or delayed_list. | |
2834 | */ | |
2835 | spin_lock(&imp->imp_lock); | |
2836 | ||
2837 | /* XXX locking? Maybe we should remove each request with the list | |
2838 | * locked? Also, how do we know if the requests on the list are | |
2839 | * being freed at this time? | |
2840 | */ | |
2841 | list_for_each_safe(tmp, n, &imp->imp_sending_list) { | |
2842 | struct ptlrpc_request *req = | |
2843 | list_entry(tmp, struct ptlrpc_request, rq_list); | |
2844 | ||
2845 | DEBUG_REQ(D_RPCTRACE, req, "inflight"); | |
2846 | ||
2847 | spin_lock(&req->rq_lock); | |
2848 | if (req->rq_import_generation < imp->imp_generation) { | |
2849 | req->rq_err = 1; | |
2850 | req->rq_status = -EIO; | |
2851 | ptlrpc_client_wake_req(req); | |
2852 | } | |
2853 | spin_unlock(&req->rq_lock); | |
2854 | } | |
2855 | ||
2856 | list_for_each_safe(tmp, n, &imp->imp_delayed_list) { | |
2857 | struct ptlrpc_request *req = | |
2858 | list_entry(tmp, struct ptlrpc_request, rq_list); | |
2859 | ||
2860 | DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req"); | |
2861 | ||
2862 | spin_lock(&req->rq_lock); | |
2863 | if (req->rq_import_generation < imp->imp_generation) { | |
2864 | req->rq_err = 1; | |
2865 | req->rq_status = -EIO; | |
2866 | ptlrpc_client_wake_req(req); | |
2867 | } | |
2868 | spin_unlock(&req->rq_lock); | |
2869 | } | |
2870 | ||
2871 | /* Last chance to free reqs left on the replay list, but we | |
2872 | * will still leak reqs that haven't committed. */ | |
2873 | if (imp->imp_replayable) | |
2874 | ptlrpc_free_committed(imp); | |
2875 | ||
2876 | spin_unlock(&imp->imp_lock); | |
d7e09d03 PT |
2877 | } |
2878 | EXPORT_SYMBOL(ptlrpc_abort_inflight); | |
2879 | ||
2880 | /** | |
2881 | * Abort all uncompleted requests in request set \a set | |
2882 | */ | |
2883 | void ptlrpc_abort_set(struct ptlrpc_request_set *set) | |
2884 | { | |
2885 | struct list_head *tmp, *pos; | |
2886 | ||
2887 | LASSERT(set != NULL); | |
2888 | ||
2889 | list_for_each_safe(pos, tmp, &set->set_requests) { | |
2890 | struct ptlrpc_request *req = | |
2891 | list_entry(pos, struct ptlrpc_request, | |
2892 | rq_set_chain); | |
2893 | ||
2894 | spin_lock(&req->rq_lock); | |
2895 | if (req->rq_phase != RQ_PHASE_RPC) { | |
2896 | spin_unlock(&req->rq_lock); | |
2897 | continue; | |
2898 | } | |
2899 | ||
2900 | req->rq_err = 1; | |
2901 | req->rq_status = -EINTR; | |
2902 | ptlrpc_client_wake_req(req); | |
2903 | spin_unlock(&req->rq_lock); | |
2904 | } | |
2905 | } | |
2906 | ||
2907 | static __u64 ptlrpc_last_xid; | |
2908 | static spinlock_t ptlrpc_last_xid_lock; | |
2909 | ||
2910 | /** | |
2911 | * Initialize the XID for the node. This is common among all requests on | |
2912 | * this node, and only requires the property that it is monotonically | |
2913 | * increasing. It does not need to be sequential. Since this is also used | |
2914 | * as the RDMA match bits, it is important that a single client NOT have | |
2915 | * the same match bits for two different in-flight requests, hence we do | |
2916 | * NOT want to have an XID per target or similar. | |
2917 | * | |
2918 | * To avoid an unlikely collision between match bits after a client reboot | |
2919 | * (which would deliver old data into the wrong RDMA buffer) initialize | |
2920 | * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s. | |
2921 | * If the time is clearly incorrect, we instead use a 62-bit random number. | |
2922 | * In the worst case the random number will overflow 1M RPCs per second in | |
2923 | * 9133 years, or permutations thereof. | |
2924 | */ | |
2925 | #define YEAR_2004 (1ULL << 30) | |
2926 | void ptlrpc_init_xid(void) | |
2927 | { | |
7264b8a5 | 2928 | time_t now = get_seconds(); |
d7e09d03 PT |
2929 | |
2930 | spin_lock_init(&ptlrpc_last_xid_lock); | |
2931 | if (now < YEAR_2004) { | |
2932 | cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid)); | |
2933 | ptlrpc_last_xid >>= 2; | |
2934 | ptlrpc_last_xid |= (1ULL << 61); | |
2935 | } else { | |
2936 | ptlrpc_last_xid = (__u64)now << 20; | |
2937 | } | |
2938 | ||
930cef9a | 2939 | /* Always need to be aligned to a power-of-two for multi-bulk BRW */ |
d7e09d03 PT |
2940 | CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0); |
2941 | ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK; | |
2942 | } | |
2943 | ||
2944 | /** | |
2945 | * Increase xid and returns resulting new value to the caller. | |
2946 | * | |
2947 | * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting | |
2948 | * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC | |
2949 | * itself uses the last bulk xid needed, so the server can determine the | |
2950 | * the number of bulk transfers from the RPC XID and a bitmask. The starting | |
2951 | * xid must align to a power-of-two value. | |
2952 | * | |
2953 | * This is assumed to be true due to the initial ptlrpc_last_xid | |
2954 | * value also being initialized to a power-of-two value. LU-1431 | |
2955 | */ | |
2956 | __u64 ptlrpc_next_xid(void) | |
2957 | { | |
2958 | __u64 next; | |
2959 | ||
2960 | spin_lock(&ptlrpc_last_xid_lock); | |
2961 | next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; | |
2962 | ptlrpc_last_xid = next; | |
2963 | spin_unlock(&ptlrpc_last_xid_lock); | |
2964 | ||
2965 | return next; | |
2966 | } | |
2967 | EXPORT_SYMBOL(ptlrpc_next_xid); | |
2968 | ||
2969 | /** | |
2970 | * Get a glimpse at what next xid value might have been. | |
2971 | * Returns possible next xid. | |
2972 | */ | |
2973 | __u64 ptlrpc_sample_next_xid(void) | |
2974 | { | |
2975 | #if BITS_PER_LONG == 32 | |
2976 | /* need to avoid possible word tearing on 32-bit systems */ | |
2977 | __u64 next; | |
2978 | ||
2979 | spin_lock(&ptlrpc_last_xid_lock); | |
2980 | next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; | |
2981 | spin_unlock(&ptlrpc_last_xid_lock); | |
2982 | ||
2983 | return next; | |
2984 | #else | |
2985 | /* No need to lock, since returned value is racy anyways */ | |
2986 | return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; | |
2987 | #endif | |
2988 | } | |
2989 | EXPORT_SYMBOL(ptlrpc_sample_next_xid); | |
2990 | ||
/**
 * Functions for operating ptlrpc workers.
 *
 * A ptlrpc work is a function which runs inside ptlrpcd context.
 * The callback must not sleep, otherwise it blocks that ptlrpcd thread.
 *
 * 1. Once a work has been created it can be used many times, that is:
 *	handler = ptlrpcd_alloc_work();
 *	ptlrpcd_queue_work();
 *
 *    queue it again when necessary:
 *	ptlrpcd_queue_work();
 *	ptlrpcd_destroy_work();
 * 2. ptlrpcd_queue_work() can be called by multiple processes at the same
 *    time, but the work is only queued once at any time.  Also, as its name
 *    implies, there may be a delay before the ptlrpcd thread really runs it.
 */
struct ptlrpc_work_async_args {
	/* the work callback, invoked by work_interpreter() */
	int (*cb)(const struct lu_env *, void *);
	/* opaque argument passed to cb */
	void *cbdata;
};
3012 | ||
82a373ae LZ |
3013 | static void ptlrpcd_add_work_req(struct ptlrpc_request *req) |
3014 | { | |
3015 | /* re-initialize the req */ | |
3016 | req->rq_timeout = obd_timeout; | |
7264b8a5 | 3017 | req->rq_sent = get_seconds(); |
82a373ae LZ |
3018 | req->rq_deadline = req->rq_sent + req->rq_timeout; |
3019 | req->rq_reply_deadline = req->rq_deadline; | |
3020 | req->rq_phase = RQ_PHASE_INTERPRET; | |
3021 | req->rq_next_phase = RQ_PHASE_COMPLETE; | |
3022 | req->rq_xid = ptlrpc_next_xid(); | |
3023 | req->rq_import_generation = req->rq_import->imp_generation; | |
3024 | ||
3025 | ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); | |
3026 | } | |
d7e09d03 PT |
3027 | |
3028 | static int work_interpreter(const struct lu_env *env, | |
3029 | struct ptlrpc_request *req, void *data, int rc) | |
3030 | { | |
3031 | struct ptlrpc_work_async_args *arg = data; | |
3032 | ||
82a373ae | 3033 | LASSERT(ptlrpcd_check_work(req)); |
d7e09d03 PT |
3034 | LASSERT(arg->cb != NULL); |
3035 | ||
82a373ae LZ |
3036 | rc = arg->cb(env, arg->cbdata); |
3037 | ||
3038 | list_del_init(&req->rq_set_chain); | |
3039 | req->rq_set = NULL; | |
3040 | ||
3041 | if (atomic_dec_return(&req->rq_refcount) > 1) { | |
3042 | atomic_set(&req->rq_refcount, 2); | |
3043 | ptlrpcd_add_work_req(req); | |
3044 | } | |
3045 | return rc; | |
3046 | } | |
3047 | ||
3048 | static int worker_format; | |
3049 | ||
3050 | static int ptlrpcd_check_work(struct ptlrpc_request *req) | |
3051 | { | |
3052 | return req->rq_pill.rc_fmt == (void *)&worker_format; | |
d7e09d03 PT |
3053 | } |
3054 | ||
3055 | /** | |
3056 | * Create a work for ptlrpc. | |
3057 | */ | |
3058 | void *ptlrpcd_alloc_work(struct obd_import *imp, | |
3059 | int (*cb)(const struct lu_env *, void *), void *cbdata) | |
3060 | { | |
3061 | struct ptlrpc_request *req = NULL; | |
3062 | struct ptlrpc_work_async_args *args; | |
d7e09d03 PT |
3063 | |
3064 | might_sleep(); | |
3065 | ||
3066 | if (cb == NULL) | |
0a3bdb00 | 3067 | return ERR_PTR(-EINVAL); |
d7e09d03 PT |
3068 | |
3069 | /* copy some code from deprecated fakereq. */ | |
0be19afa | 3070 | req = ptlrpc_request_cache_alloc(GFP_NOFS); |
d7e09d03 PT |
3071 | if (req == NULL) { |
3072 | CERROR("ptlrpc: run out of memory!\n"); | |
0a3bdb00 | 3073 | return ERR_PTR(-ENOMEM); |
d7e09d03 PT |
3074 | } |
3075 | ||
3076 | req->rq_send_state = LUSTRE_IMP_FULL; | |
3077 | req->rq_type = PTL_RPC_MSG_REQUEST; | |
3078 | req->rq_import = class_import_get(imp); | |
3079 | req->rq_export = NULL; | |
3080 | req->rq_interpret_reply = work_interpreter; | |
3081 | /* don't want reply */ | |
3082 | req->rq_receiving_reply = 0; | |
cf378ff7 | 3083 | req->rq_req_unlink = req->rq_reply_unlink = 0; |
d7e09d03 | 3084 | req->rq_no_delay = req->rq_no_resend = 1; |
82a373ae | 3085 | req->rq_pill.rc_fmt = (void *)&worker_format; |
d7e09d03 PT |
3086 | |
3087 | spin_lock_init(&req->rq_lock); | |
3088 | INIT_LIST_HEAD(&req->rq_list); | |
3089 | INIT_LIST_HEAD(&req->rq_replay_list); | |
3090 | INIT_LIST_HEAD(&req->rq_set_chain); | |
3091 | INIT_LIST_HEAD(&req->rq_history_list); | |
3092 | INIT_LIST_HEAD(&req->rq_exp_list); | |
3093 | init_waitqueue_head(&req->rq_reply_waitq); | |
3094 | init_waitqueue_head(&req->rq_set_waitq); | |
3095 | atomic_set(&req->rq_refcount, 1); | |
3096 | ||
3949015e | 3097 | CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args)); |
d7e09d03 | 3098 | args = ptlrpc_req_async_args(req); |
d7e09d03 PT |
3099 | args->cb = cb; |
3100 | args->cbdata = cbdata; | |
3101 | ||
0a3bdb00 | 3102 | return req; |
d7e09d03 PT |
3103 | } |
3104 | EXPORT_SYMBOL(ptlrpcd_alloc_work); | |
3105 | ||
/**
 * Release the work \a handler by calling ptlrpc_req_finished() on the
 * underlying request.  A NULL \a handler is ignored.
 */
void ptlrpcd_destroy_work(void *handler)
{
	struct ptlrpc_request *req = handler;

	if (!req)
		return;

	ptlrpc_req_finished(req);
}
EXPORT_SYMBOL(ptlrpcd_destroy_work);
3114 | ||
3115 | int ptlrpcd_queue_work(void *handler) | |
3116 | { | |
3117 | struct ptlrpc_request *req = handler; | |
3118 | ||
3119 | /* | |
3120 | * Check if the req is already being queued. | |
3121 | * | |
3122 | * Here comes a trick: it lacks a way of checking if a req is being | |
3123 | * processed reliably in ptlrpc. Here I have to use refcount of req | |
3124 | * for this purpose. This is okay because the caller should use this | |
3125 | * req as opaque data. - Jinshan | |
3126 | */ | |
3127 | LASSERT(atomic_read(&req->rq_refcount) > 0); | |
82a373ae LZ |
3128 | if (atomic_inc_return(&req->rq_refcount) == 2) |
3129 | ptlrpcd_add_work_req(req); | |
d7e09d03 PT |
3130 | return 0; |
3131 | } | |
3132 | EXPORT_SYMBOL(ptlrpcd_queue_work); |