/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */

/** Implementation of client-side PortalRPC interfaces */

#define DEBUG_SUBSYSTEM S_RPC

#include "../include/obd_support.h"
#include "../include/obd_class.h"
#include "../include/lustre_lib.h"
#include "../include/lustre_ha.h"
#include "../include/lustre_import.h"
#include "../include/lustre_req_layout.h"

#include "ptlrpc_internal.h"

static int ptlrpc_send_new_req(struct ptlrpc_request *req);
static int ptlrpcd_check_work(struct ptlrpc_request *req);

/**
 * Initialize passed in client structure \a cl.
 */
void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
			struct ptlrpc_client *cl)
{
	cl->cli_request_portal = req_portal;
	cl->cli_reply_portal = rep_portal;
	cl->cli_name = name;
}
EXPORT_SYMBOL(ptlrpc_init_client);
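
/*
 * Illustrative usage sketch (not part of the original file): a client
 * stack typically initializes its ptlrpc_client once at setup time, e.g.
 *
 *	ptlrpc_init_client(OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
 *			   "osc", &obd->obd_ldlm_client);
 *
 * The portal constants and the obd field shown here are assumptions for
 * illustration; ptlrpc_init_client() itself only records the two portals
 * and the name.
 */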

/**
 * Return PortalRPC connection for remote uuid \a uuid
 */
struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
{
	struct ptlrpc_connection *c;
	lnet_nid_t self;
	lnet_process_id_t peer;
	int err;

	/* ptlrpc_uuid_to_peer() initializes its 2nd parameter
	 * before accessing its values. */
	/* coverity[uninit_use_in_call] */
	err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
	if (err != 0) {
		CNETERR("cannot find peer %s!\n", uuid->uuid);
		return NULL;
	}

	c = ptlrpc_connection_get(peer, self, uuid);
	if (c) {
		memcpy(c->c_remote_uuid.uuid,
		       uuid->uuid, sizeof(c->c_remote_uuid.uuid));
	}

	CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c);

	return c;
}
EXPORT_SYMBOL(ptlrpc_uuid_to_connection);

/**
 * Allocate and initialize new bulk descriptor on the sender.
 * Returns pointer to the descriptor or NULL on error.
 */
struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
					 unsigned type, unsigned portal)
{
	struct ptlrpc_bulk_desc *desc;
	int i;

	desc = kzalloc(offsetof(struct ptlrpc_bulk_desc, bd_iov[npages]),
		       GFP_NOFS);
	if (!desc)
		return NULL;

	spin_lock_init(&desc->bd_lock);
	init_waitqueue_head(&desc->bd_waitq);
	desc->bd_max_iov = npages;
	desc->bd_iov_count = 0;
	desc->bd_portal = portal;
	desc->bd_type = type;
	desc->bd_md_count = 0;
	LASSERT(max_brw > 0);
	desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
	/* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
	 * node. Negotiated ocd_brw_size will always be <= this number. */
	for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
		LNetInvalidateHandle(&desc->bd_mds[i]);

	return desc;
}

/**
 * Prepare bulk descriptor for specified outgoing request \a req that
 * can fit \a npages * pages. \a type is bulk type. \a portal is where
 * the bulk is to be sent. Used on client-side.
 * Returns pointer to newly allocated initialized bulk descriptor or NULL on
 * error.
 */
struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
					      unsigned npages, unsigned max_brw,
					      unsigned type, unsigned portal)
{
	struct obd_import *imp = req->rq_import;
	struct ptlrpc_bulk_desc *desc;

	LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
	desc = ptlrpc_new_bulk(npages, max_brw, type, portal);
	if (desc == NULL)
		return NULL;

	desc->bd_import_generation = req->rq_import_generation;
	desc->bd_import = class_import_get(imp);
	desc->bd_req = req;

	desc->bd_cbid.cbid_fn = client_bulk_callback;
	desc->bd_cbid.cbid_arg = desc;

	/* This makes the req own the desc; it is freed when the req itself
	 * is freed. */
	req->rq_bulk = desc;

	return desc;
}
EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);

/**
 * Add a page \a page to the bulk descriptor \a desc.
 * Data to transfer in the page starts at offset \a pageoffset and
 * amount of data to transfer from the page is \a len
 */
void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
			     struct page *page, int pageoffset, int len,
			     int pin)
{
	LASSERT(desc->bd_iov_count < desc->bd_max_iov);
	LASSERT(page != NULL);
	LASSERT(pageoffset >= 0);
	LASSERT(len > 0);
	LASSERT(pageoffset + len <= PAGE_CACHE_SIZE);

	desc->bd_nob += len;

	if (pin)
		page_cache_get(page);

	ptlrpc_add_bulk_page(desc, page, pageoffset, len);
}
EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
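
/*
 * Illustrative sketch (assumptions marked): a client-side bulk read is
 * typically set up by allocating a descriptor sized for the transfer and
 * then attaching each data page, roughly:
 *
 *	desc = ptlrpc_prep_bulk_imp(req, npages, 1, BULK_PUT_SINK,
 *				    OST_BULK_PORTAL);
 *	for (i = 0; i < npages; i++)
 *		__ptlrpc_prep_bulk_page(desc, pages[i], 0,
 *					PAGE_CACHE_SIZE, 1);
 *
 * OST_BULK_PORTAL, the max_brw of 1 and the pages[] array are hypothetical
 * here; the bulk type and portal depend on the service being addressed.
 */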

/**
 * Uninitialize and free bulk descriptor \a desc.
 * Works on bulk descriptors both from server and client side.
 */
void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin)
{
	int i;

	LASSERT(desc != NULL);
	LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
	LASSERT(desc->bd_md_count == 0);	  /* network hands off */
	LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));

	sptlrpc_enc_pool_put_pages(desc);

	if (desc->bd_export)
		class_export_put(desc->bd_export);
	else
		class_import_put(desc->bd_import);

	if (unpin) {
		for (i = 0; i < desc->bd_iov_count; i++)
			page_cache_release(desc->bd_iov[i].kiov_page);
	}

	kfree(desc);
}
EXPORT_SYMBOL(__ptlrpc_free_bulk);

/**
 * Set server time limit for this req, i.e. how long are we willing to wait
 * for reply before timing out this request.
 */
void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
{
	__u32 serv_est;
	int idx;
	struct imp_at *at;

	LASSERT(req->rq_import);

	if (AT_OFF) {
		/* non-AT settings */
		/**
		 * \a imp_server_timeout means this is a reverse import and
		 * we send (currently only) ASTs to the client and cannot
		 * afford to wait too long for the reply, otherwise the
		 * other client (because of which we are sending this
		 * request) would timeout waiting for us
		 */
		req->rq_timeout = req->rq_import->imp_server_timeout ?
				  obd_timeout / 2 : obd_timeout;
	} else {
		at = &req->rq_import->imp_at;
		idx = import_at_get_index(req->rq_import,
					  req->rq_request_portal);
		serv_est = at_get(&at->iat_service_estimate[idx]);
		req->rq_timeout = at_est2timeout(serv_est);
	}
	/* We could get even fancier here, using history to predict increased
	   loading... */

	/* Let the server know what this RPC timeout is by putting it in the
	   reqmsg */
	lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
}
EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
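
/*
 * Worked example (illustrative): with adaptive timeouts on, if the current
 * service estimate for the target portal is 20 s, at_est2timeout() pads it
 * to roughly est + est/4 + 5 = 30 s (the "125% + 5 s" rule referenced in
 * after_reply() below), so rq_timeout becomes 30 and that value is also
 * advertised to the server in the request message. The exact padding
 * formula lives in at_est2timeout() and may differ between versions.
 */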

/* Adjust max service estimate based on server value */
static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
				  unsigned int serv_est)
{
	int idx;
	unsigned int oldse;
	struct imp_at *at;

	LASSERT(req->rq_import);
	at = &req->rq_import->imp_at;

	idx = import_at_get_index(req->rq_import, req->rq_request_portal);
	/* max service estimates are tracked on the server side,
	   so just keep minimal history here */
	oldse = at_measured(&at->iat_service_estimate[idx], serv_est);
	if (oldse != 0)
		CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d has changed from %d to %d\n",
		       req->rq_import->imp_obd->obd_name,
		       req->rq_request_portal,
		       oldse, at_get(&at->iat_service_estimate[idx]));
}

/* Expected network latency per remote node (secs) */
int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
{
	return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
}

/* Adjust expected network latency */
static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
				      unsigned int service_time)
{
	unsigned int nl, oldnl;
	struct imp_at *at;
	time_t now = get_seconds();

	LASSERT(req->rq_import);

	if (service_time > now - req->rq_sent + 3) {
		/* bz16408, however, this can also happen if early reply
		 * is lost and client RPC is expired and resent, early reply
		 * or reply of original RPC can still be fit in reply buffer
		 * of resent RPC, now client is measuring time from the
		 * resent time, but server sent back service time of original
		 * RPC.
		 */
		CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ?
		       D_ADAPTTO : D_WARNING,
		       "Reported service time %u > total measured time "
		       CFS_DURATION_T"\n", service_time,
		       cfs_time_sub(now, req->rq_sent));
		return;
	}

	/* Network latency is total time less server processing time */
	nl = max_t(int, now - req->rq_sent -
		   service_time, 0) + 1; /* st rounding */
	at = &req->rq_import->imp_at;

	oldnl = at_measured(&at->iat_net_latency, nl);
	if (oldnl != 0)
		CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) has changed from %d to %d\n",
		       req->rq_import->imp_obd->obd_name,
		       obd_uuid2str(
			       &req->rq_import->imp_connection->c_remote_uuid),
		       oldnl, at_get(&at->iat_net_latency));
}
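
/*
 * Worked example (illustrative): if the request was sent at t = 100 s, the
 * reply arrives at now = 107 s, and the server reports 5 s of service time,
 * the measured network latency is max(107 - 100 - 5, 0) + 1 = 3 s; the +1
 * compensates for the server's whole-second rounding of service time.
 */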

static int unpack_reply(struct ptlrpc_request *req)
{
	int rc;

	if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
		rc = ptlrpc_unpack_rep_msg(req, req->rq_replen);
		if (rc) {
			DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
			return -EPROTO;
		}
	}

	rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
	if (rc) {
		DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
		return -EPROTO;
	}
	return 0;
}

/**
 * Handle an early reply message, called with the rq_lock held.
 * If anything goes wrong just ignore it - same as if it never happened
 */
static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
{
	struct ptlrpc_request *early_req;
	time_t olddl;
	int rc;

	req->rq_early = 0;
	spin_unlock(&req->rq_lock);

	rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
	if (rc) {
		spin_lock(&req->rq_lock);
		return rc;
	}

	rc = unpack_reply(early_req);
	if (rc == 0) {
		/* Expecting to increase the service time estimate here */
		ptlrpc_at_adj_service(req,
			lustre_msg_get_timeout(early_req->rq_repmsg));
		ptlrpc_at_adj_net_latency(req,
			lustre_msg_get_service_time(early_req->rq_repmsg));
	}

	sptlrpc_cli_finish_early_reply(early_req);

	if (rc != 0) {
		spin_lock(&req->rq_lock);
		return rc;
	}

	/* Adjust the local timeout for this req */
	ptlrpc_at_set_req_timeout(req);

	spin_lock(&req->rq_lock);
	olddl = req->rq_deadline;
	/* server assumes it now has rq_timeout from when it sent the
	 * early reply, so client should give it at least that long. */
	req->rq_deadline = get_seconds() + req->rq_timeout +
			   ptlrpc_at_get_net_latency(req);

	DEBUG_REQ(D_ADAPTTO, req,
		  "Early reply #%d, new deadline in " CFS_DURATION_T "s (" CFS_DURATION_T "s)",
		  req->rq_early_count,
		  cfs_time_sub(req->rq_deadline, get_seconds()),
		  cfs_time_sub(req->rq_deadline, olddl));

	return rc;
}

static struct kmem_cache *request_cache;

int ptlrpc_request_cache_init(void)
{
	request_cache = kmem_cache_create("ptlrpc_cache",
					  sizeof(struct ptlrpc_request),
					  0, SLAB_HWCACHE_ALIGN, NULL);
	return request_cache == NULL ? -ENOMEM : 0;
}

void ptlrpc_request_cache_fini(void)
{
	kmem_cache_destroy(request_cache);
}

struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags)
{
	struct ptlrpc_request *req;

	OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags);
	return req;
}

void ptlrpc_request_cache_free(struct ptlrpc_request *req)
{
	OBD_SLAB_FREE_PTR(req, request_cache);
}
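/*
 * Illustrative note (an assumption about callers outside this file): the
 * cache init/fini pair is meant to bracket the lifetime of the ptlrpc
 * layer, roughly:
 *
 *	rc = ptlrpc_request_cache_init();
 *	if (rc)
 *		return rc;
 *	...
 *	ptlrpc_request_cache_fini();
 *
 * so every ptlrpc_request_cache_alloc() between the two is served from the
 * "ptlrpc_cache" slab rather than the generic allocator.
 */
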
/**
 * Wind down request pool \a pool.
 * Frees all requests from the pool too
 */
void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
{
	struct list_head *l, *tmp;
	struct ptlrpc_request *req;

	LASSERT(pool != NULL);

	spin_lock(&pool->prp_lock);
	list_for_each_safe(l, tmp, &pool->prp_req_list) {
		req = list_entry(l, struct ptlrpc_request, rq_list);
		list_del(&req->rq_list);
		LASSERT(req->rq_reqbuf);
		LASSERT(req->rq_reqbuf_len == pool->prp_rq_size);
		kvfree(req->rq_reqbuf);
		ptlrpc_request_cache_free(req);
	}
	spin_unlock(&pool->prp_lock);
	kfree(pool);
}
EXPORT_SYMBOL(ptlrpc_free_rq_pool);

/**
 * Allocates, initializes and adds \a num_rq requests to the pool \a pool
 */
void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
{
	int i;
	int size = 1;

	while (size < pool->prp_rq_size)
		size <<= 1;

	LASSERTF(list_empty(&pool->prp_req_list) ||
		 size == pool->prp_rq_size,
		 "Trying to change pool size with nonempty pool from %d to %d bytes\n",
		 pool->prp_rq_size, size);

	spin_lock(&pool->prp_lock);
	pool->prp_rq_size = size;
	for (i = 0; i < num_rq; i++) {
		struct ptlrpc_request *req;
		struct lustre_msg *msg;

		spin_unlock(&pool->prp_lock);
		req = ptlrpc_request_cache_alloc(GFP_NOFS);
		if (!req)
			return;
		msg = libcfs_kvzalloc(size, GFP_NOFS);
		if (!msg) {
			ptlrpc_request_cache_free(req);
			return;
		}
		req->rq_reqbuf = msg;
		req->rq_reqbuf_len = size;
		req->rq_pool = pool;
		spin_lock(&pool->prp_lock);
		list_add_tail(&req->rq_list, &pool->prp_req_list);
	}
	spin_unlock(&pool->prp_lock);
}
EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);

/**
 * Create and initialize new request pool with given attributes:
 * \a num_rq - initial number of requests to create for the pool
 * \a msgsize - maximum message size possible for requests in this pool
 * \a populate_pool - function to be called when more requests need to be added
 *                    to the pool
 * Returns pointer to newly created pool or NULL on error.
 */
struct ptlrpc_request_pool *
ptlrpc_init_rq_pool(int num_rq, int msgsize,
		    void (*populate_pool)(struct ptlrpc_request_pool *, int))
{
	struct ptlrpc_request_pool *pool;

	pool = kzalloc(sizeof(struct ptlrpc_request_pool), GFP_NOFS);
	if (!pool)
		return NULL;

	/* Request next power of two for the allocation, because internally
	   kernel would do exactly this */

	spin_lock_init(&pool->prp_lock);
	INIT_LIST_HEAD(&pool->prp_req_list);
	pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD;
	pool->prp_populate = populate_pool;

	populate_pool(pool, num_rq);

	if (list_empty(&pool->prp_req_list)) {
		/* have not allocated a single request for the pool */
		kfree(pool);
		pool = NULL;
	}
	return pool;
}
EXPORT_SYMBOL(ptlrpc_init_rq_pool);
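
/*
 * Illustrative usage sketch: a caller that must be able to make progress
 * under memory pressure (e.g. the writeout path) pre-populates a pool and
 * later draws requests from it, roughly:
 *
 *	pool = ptlrpc_init_rq_pool(num_rq, msgsize, ptlrpc_add_rqs_to_pool);
 *	...
 *	req = ptlrpc_request_alloc_pool(imp, pool, format);
 *	...
 *	ptlrpc_free_rq_pool(pool);
 *
 * num_rq, msgsize and format are placeholders; ptlrpc_add_rqs_to_pool()
 * above is the stock populate callback this file provides.
 */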

/**
 * Fetches one request from pool \a pool
 */
static struct ptlrpc_request *
ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool)
{
	struct ptlrpc_request *request;
	struct lustre_msg *reqbuf;

	if (!pool)
		return NULL;

	spin_lock(&pool->prp_lock);

	/* See if we have anything in a pool, and bail out if nothing,
	 * in writeout path, where this matters, this is safe to do, because
	 * nothing is lost in this case, and when some in-flight requests
	 * complete, this code will be called again. */
	if (unlikely(list_empty(&pool->prp_req_list))) {
		spin_unlock(&pool->prp_lock);
		return NULL;
	}

	request = list_entry(pool->prp_req_list.next, struct ptlrpc_request,
			     rq_list);
	list_del_init(&request->rq_list);
	spin_unlock(&pool->prp_lock);

	LASSERT(request->rq_reqbuf);
	LASSERT(request->rq_pool);

	reqbuf = request->rq_reqbuf;
	memset(request, 0, sizeof(*request));
	request->rq_reqbuf = reqbuf;
	request->rq_reqbuf_len = pool->prp_rq_size;
	request->rq_pool = pool;

	return request;
}

/**
 * Returns freed \a request to pool.
 */
static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
{
	struct ptlrpc_request_pool *pool = request->rq_pool;

	spin_lock(&pool->prp_lock);
	LASSERT(list_empty(&request->rq_list));
	LASSERT(!request->rq_receiving_reply);
	list_add_tail(&request->rq_list, &pool->prp_req_list);
	spin_unlock(&pool->prp_lock);
}

static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
				      __u32 version, int opcode,
				      int count, __u32 *lengths, char **bufs,
				      struct ptlrpc_cli_ctx *ctx)
{
	struct obd_import *imp = request->rq_import;
	int rc;

	if (unlikely(ctx))
		request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx);
	else {
		rc = sptlrpc_req_get_ctx(request);
		if (rc)
			goto out_free;
	}

	sptlrpc_req_set_flavor(request, opcode);

	rc = lustre_pack_request(request, imp->imp_msg_magic, count,
				 lengths, bufs);
	if (rc) {
		LASSERT(!request->rq_pool);
		goto out_ctx;
	}

	lustre_msg_add_version(request->rq_reqmsg, version);
	request->rq_send_state = LUSTRE_IMP_FULL;
	request->rq_type = PTL_RPC_MSG_REQUEST;
	request->rq_export = NULL;

	request->rq_req_cbid.cbid_fn = request_out_callback;
	request->rq_req_cbid.cbid_arg = request;

	request->rq_reply_cbid.cbid_fn = reply_in_callback;
	request->rq_reply_cbid.cbid_arg = request;

	request->rq_reply_deadline = 0;
	request->rq_phase = RQ_PHASE_NEW;
	request->rq_next_phase = RQ_PHASE_UNDEFINED;

	request->rq_request_portal = imp->imp_client->cli_request_portal;
	request->rq_reply_portal = imp->imp_client->cli_reply_portal;

	ptlrpc_at_set_req_timeout(request);

	spin_lock_init(&request->rq_lock);
	INIT_LIST_HEAD(&request->rq_list);
	INIT_LIST_HEAD(&request->rq_timed_list);
	INIT_LIST_HEAD(&request->rq_replay_list);
	INIT_LIST_HEAD(&request->rq_ctx_chain);
	INIT_LIST_HEAD(&request->rq_set_chain);
	INIT_LIST_HEAD(&request->rq_history_list);
	INIT_LIST_HEAD(&request->rq_exp_list);
	init_waitqueue_head(&request->rq_reply_waitq);
	init_waitqueue_head(&request->rq_set_waitq);
	request->rq_xid = ptlrpc_next_xid();
	atomic_set(&request->rq_refcount, 1);

	lustre_msg_set_opc(request->rq_reqmsg, opcode);

	return 0;
out_ctx:
	sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1);
out_free:
	class_import_put(imp);
	return rc;
}

int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
			     __u32 version, int opcode, char **bufs,
			     struct ptlrpc_cli_ctx *ctx)
{
	int count;

	count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT);
	return __ptlrpc_request_bufs_pack(request, version, opcode, count,
					  request->rq_pill.rc_area[RCL_CLIENT],
					  bufs, ctx);
}
EXPORT_SYMBOL(ptlrpc_request_bufs_pack);

/**
 * Pack request buffers for network transfer, performing any necessary
 * encryption steps.
 */
int ptlrpc_request_pack(struct ptlrpc_request *request,
			__u32 version, int opcode)
{
	int rc;

	rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
	if (rc)
		return rc;

	/* For some old 1.8 clients (< 1.8.7), they will LASSERT the size of
	 * ptlrpc_body sent from server equal to local ptlrpc_body size, so we
	 * have to send old ptlrpc_body to keep interoperability with these
	 * clients.
	 *
	 * Only three kinds of server->client RPCs so far:
	 *  - LDLM_BL_CALLBACK
	 *  - LDLM_CP_CALLBACK
	 *  - LDLM_GL_CALLBACK
	 *
	 * XXX This should be removed whenever we drop the interoperability
	 * with these old clients.
	 */
	if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK ||
	    opcode == LDLM_GL_CALLBACK)
		req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY,
				   sizeof(struct ptlrpc_body_v2), RCL_CLIENT);

	return rc;
}
EXPORT_SYMBOL(ptlrpc_request_pack);

/**
 * Helper function to allocate new request on import \a imp,
 * possibly using an existing request from pool \a pool if provided.
 * Returns allocated request structure with import field filled or
 * NULL on error.
 */
static inline
struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
					      struct ptlrpc_request_pool *pool)
{
	struct ptlrpc_request *request = NULL;

	if (pool)
		request = ptlrpc_prep_req_from_pool(pool);

	if (!request)
		request = ptlrpc_request_cache_alloc(GFP_NOFS);

	if (request) {
		LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
		LASSERT(imp != LP_POISON);
		LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p",
			 imp->imp_client);
		LASSERT(imp->imp_client != LP_POISON);

		request->rq_import = class_import_get(imp);
	} else {
		CERROR("request allocation out of memory\n");
	}

	return request;
}

/**
 * Helper function for creating a request.
 * Calls __ptlrpc_request_alloc to allocate new request structure and inits
 * buffer structures according to capsule template \a format.
 * Returns allocated request structure pointer or NULL on error.
 */
static struct ptlrpc_request *
ptlrpc_request_alloc_internal(struct obd_import *imp,
			      struct ptlrpc_request_pool *pool,
			      const struct req_format *format)
{
	struct ptlrpc_request *request;

	request = __ptlrpc_request_alloc(imp, pool);
	if (request == NULL)
		return NULL;

	req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
	req_capsule_set(&request->rq_pill, format);
	return request;
}

/**
 * Allocate new request structure for import \a imp and initialize its
 * buffer structure according to capsule template \a format.
 */
struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
					    const struct req_format *format)
{
	return ptlrpc_request_alloc_internal(imp, NULL, format);
}
EXPORT_SYMBOL(ptlrpc_request_alloc);

/**
 * Allocate new request structure for import \a imp from pool \a pool and
 * initialize its buffer structure according to capsule template \a format.
 */
struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
						 struct ptlrpc_request_pool *pool,
						 const struct req_format *format)
{
	return ptlrpc_request_alloc_internal(imp, pool, format);
}
EXPORT_SYMBOL(ptlrpc_request_alloc_pool);

/**
 * For requests not from pool, free memory of the request structure.
 * For requests obtained from a pool earlier, return request back to pool.
 */
void ptlrpc_request_free(struct ptlrpc_request *request)
{
	if (request->rq_pool)
		__ptlrpc_free_req_to_pool(request);
	else
		ptlrpc_request_cache_free(request);
}
EXPORT_SYMBOL(ptlrpc_request_free);

/**
 * Allocate new request for operation \a opcode and immediately pack it for
 * network transfer.
 * Only used for simple requests like OBD_PING where the only important
 * part of the request is operation itself.
 * Returns allocated request or NULL on error.
 */
struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
						 const struct req_format *format,
						 __u32 version, int opcode)
{
	struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format);
	int rc;

	if (req) {
		rc = ptlrpc_request_pack(req, version, opcode);
		if (rc) {
			ptlrpc_request_free(req);
			req = NULL;
		}
	}
	return req;
}
EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
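
/*
 * Illustrative sketch (format and version names are assumptions): the ping
 * path is the canonical user of this helper, something like
 *
 *	req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
 *					LUSTRE_OBD_VERSION, OBD_PING);
 *
 * which leaves req ready to be queued or sent; only the opcode matters for
 * such requests, exactly as the comment above describes.
 */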

/**
 * Prepare request (fetched from pool \a pool if not NULL) on import \a imp
 * for operation \a opcode. Request would contain \a count buffers.
 * Sizes of buffers are described in array \a lengths and buffers themselves
 * are provided by a pointer \a bufs.
 * Returns prepared request structure pointer or NULL on error.
 */
struct ptlrpc_request *
ptlrpc_prep_req_pool(struct obd_import *imp,
		     __u32 version, int opcode,
		     int count, __u32 *lengths, char **bufs,
		     struct ptlrpc_request_pool *pool)
{
	struct ptlrpc_request *request;
	int rc;

	request = __ptlrpc_request_alloc(imp, pool);
	if (!request)
		return NULL;

	rc = __ptlrpc_request_bufs_pack(request, version, opcode, count,
					lengths, bufs, NULL);
	if (rc) {
		ptlrpc_request_free(request);
		request = NULL;
	}
	return request;
}
EXPORT_SYMBOL(ptlrpc_prep_req_pool);

/**
 * Same as ptlrpc_prep_req_pool, but without pool
 */
struct ptlrpc_request *
ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count,
		__u32 *lengths, char **bufs)
{
	return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs,
				    NULL);
}
EXPORT_SYMBOL(ptlrpc_prep_req);

/**
 * Allocate and initialize new request set structure.
 * Returns a pointer to the newly allocated set structure or NULL on error.
 */
struct ptlrpc_request_set *ptlrpc_prep_set(void)
{
	struct ptlrpc_request_set *set;

	set = kzalloc(sizeof(*set), GFP_NOFS);
	if (!set)
		return NULL;
	atomic_set(&set->set_refcount, 1);
	INIT_LIST_HEAD(&set->set_requests);
	init_waitqueue_head(&set->set_waitq);
	atomic_set(&set->set_new_count, 0);
	atomic_set(&set->set_remaining, 0);
	spin_lock_init(&set->set_new_req_lock);
	INIT_LIST_HEAD(&set->set_new_requests);
	INIT_LIST_HEAD(&set->set_cblist);
	set->set_max_inflight = UINT_MAX;
	set->set_producer = NULL;
	set->set_producer_arg = NULL;
	set->set_rc = 0;

	return set;
}
EXPORT_SYMBOL(ptlrpc_prep_set);
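
/*
 * Illustrative usage sketch (ptlrpc_set_wait is assumed from elsewhere in
 * this file): a caller batching several RPCs typically does
 *
 *	set = ptlrpc_prep_set();
 *	ptlrpc_set_add_req(set, req1);
 *	ptlrpc_set_add_req(set, req2);
 *	rc = ptlrpc_set_wait(set);
 *	ptlrpc_set_destroy(set);
 *
 * The set owns the request references once they are added, as noted at
 * ptlrpc_set_add_req() below.
 */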

/**
 * Allocate and initialize new request set structure with flow control
 * extension. This extension allows controlling the number of requests
 * in-flight for the whole set. A callback function to generate requests
 * must be provided and the request set will keep the number of requests
 * sent over the wire to @max_inflight.
 * Returns a pointer to the newly allocated set structure or NULL on error.
 */
struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
					     void *arg)

{
	struct ptlrpc_request_set *set;

	set = ptlrpc_prep_set();
	if (!set)
		return NULL;

	set->set_max_inflight = max;
	set->set_producer = func;
	set->set_producer_arg = arg;

	return set;
}
EXPORT_SYMBOL(ptlrpc_prep_fcset);
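
/*
 * Illustrative producer sketch (build_next_rpc() is a hypothetical helper):
 * the producer callback is invoked from ptlrpc_set_producer() below until
 * it returns -ENOENT, e.g.
 *
 *	static int my_producer(struct ptlrpc_request_set *set, void *arg)
 *	{
 *		struct ptlrpc_request *req = build_next_rpc(arg);
 *
 *		if (req == NULL)
 *			return -ENOENT;	(no more RPCs to produce)
 *		ptlrpc_set_add_req(set, req);
 *		return 0;
 *	}
 */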

/**
 * Wind down and free request set structure previously allocated with
 * ptlrpc_prep_set.
 * Ensures that all requests on the set have completed and removes
 * all requests from the request list in a set.
 * If any unsent requests happen to be on the list, pretends that they got
 * an error in flight and calls their completion handler.
 */
void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
{
	struct list_head *tmp;
	struct list_head *next;
	int expected_phase;
	int n = 0;

	/* Requests on the set should either all be completed, or all be new */
	expected_phase = (atomic_read(&set->set_remaining) == 0) ?
			 RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
	list_for_each(tmp, &set->set_requests) {
		struct ptlrpc_request *req =
			list_entry(tmp, struct ptlrpc_request,
				   rq_set_chain);

		LASSERT(req->rq_phase == expected_phase);
		n++;
	}

	LASSERTF(atomic_read(&set->set_remaining) == 0 ||
		 atomic_read(&set->set_remaining) == n, "%d / %d\n",
		 atomic_read(&set->set_remaining), n);

	list_for_each_safe(tmp, next, &set->set_requests) {
		struct ptlrpc_request *req =
			list_entry(tmp, struct ptlrpc_request,
				   rq_set_chain);
		list_del_init(&req->rq_set_chain);

		LASSERT(req->rq_phase == expected_phase);

		if (req->rq_phase == RQ_PHASE_NEW) {
			ptlrpc_req_interpret(NULL, req, -EBADR);
			atomic_dec(&set->set_remaining);
		}

		spin_lock(&req->rq_lock);
		req->rq_set = NULL;
		req->rq_invalid_rqset = 0;
		spin_unlock(&req->rq_lock);

		ptlrpc_req_finished(req);
	}

	LASSERT(atomic_read(&set->set_remaining) == 0);

	ptlrpc_reqset_put(set);
}
EXPORT_SYMBOL(ptlrpc_set_destroy);

/**
 * Add a callback function \a fn to the set.
 * This function would be called when all requests on this set are completed.
 * The function will be passed \a data argument.
 */
int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
		      set_interpreter_func fn, void *data)
{
	struct ptlrpc_set_cbdata *cbdata;

	cbdata = kzalloc(sizeof(*cbdata), GFP_NOFS);
	if (!cbdata)
		return -ENOMEM;

	cbdata->psc_interpret = fn;
	cbdata->psc_data = data;
	list_add_tail(&cbdata->psc_item, &set->set_cblist);

	return 0;
}
EXPORT_SYMBOL(ptlrpc_set_add_cb);

/**
 * Add a new request to the general purpose request set.
 * Assumes request reference from the caller.
 */
void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
			struct ptlrpc_request *req)
{
	LASSERT(list_empty(&req->rq_set_chain));

	/* The set takes over the caller's request reference */
	list_add_tail(&req->rq_set_chain, &set->set_requests);
	req->rq_set = set;
	atomic_inc(&set->set_remaining);
	req->rq_queued_time = cfs_time_current();

	if (req->rq_reqmsg != NULL)
		lustre_msg_set_jobid(req->rq_reqmsg, NULL);

	if (set->set_producer != NULL)
		/* If the request set has a producer callback, the RPC must be
		 * sent straight away */
		ptlrpc_send_new_req(req);
}
EXPORT_SYMBOL(ptlrpc_set_add_req);

/**
 * Add a request to a request set with a dedicated server thread
 * and wake the thread to do any necessary processing.
 * Currently only used for ptlrpcd.
 */
void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
			    struct ptlrpc_request *req)
{
	struct ptlrpc_request_set *set = pc->pc_set;
	int count, i;

	LASSERT(req->rq_set == NULL);
	LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);

	spin_lock(&set->set_new_req_lock);
	/*
	 * The set takes over the caller's request reference.
	 */
	req->rq_set = set;
	req->rq_queued_time = cfs_time_current();
	list_add_tail(&req->rq_set_chain, &set->set_new_requests);
	count = atomic_inc_return(&set->set_new_count);
	spin_unlock(&set->set_new_req_lock);

	/* Only need to call wakeup once for the first entry. */
	if (count == 1) {
		wake_up(&set->set_waitq);

		/* XXX: It may be unnecessary to wake up all the partners. But
		 * to guarantee the async RPC can be processed ASAP, we have
		 * no other better choice. It may be fixed in future. */
		for (i = 0; i < pc->pc_npartners; i++)
			wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
	}
}
EXPORT_SYMBOL(ptlrpc_set_add_new_req);

/**
 * Based on the current state of the import, determine if the request
 * can be sent, is an error, or should be delayed.
 *
 * Returns true if this request should be delayed. If false, and
 * *status is set, then the request can not be sent and *status is the
 * error code. If false and status is 0, then request can be sent.
 *
 * The imp->imp_lock must be held.
 */
static int ptlrpc_import_delay_req(struct obd_import *imp,
				   struct ptlrpc_request *req, int *status)
{
	int delay = 0;

	LASSERT(status != NULL);
	*status = 0;

	if (req->rq_ctx_init || req->rq_ctx_fini) {
		/* always allow ctx init/fini rpc go through */
	} else if (imp->imp_state == LUSTRE_IMP_NEW) {
		DEBUG_REQ(D_ERROR, req, "Uninitialized import.");
		*status = -EIO;
	} else if (imp->imp_state == LUSTRE_IMP_CLOSED) {
		/* pings may safely race with umount */
		DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ?
			  D_HA : D_ERROR, req, "IMP_CLOSED ");
		*status = -EIO;
	} else if (ptlrpc_send_limit_expired(req)) {
		/* probably doesn't need to be a D_ERROR after initial testing */
		DEBUG_REQ(D_ERROR, req, "send limit expired ");
		*status = -EIO;
	} else if (req->rq_send_state == LUSTRE_IMP_CONNECTING &&
		   imp->imp_state == LUSTRE_IMP_CONNECTING) {
		/* allow CONNECT even if import is invalid */
		if (atomic_read(&imp->imp_inval_count) != 0) {
			DEBUG_REQ(D_ERROR, req, "invalidate in flight");
			*status = -EIO;
		}
	} else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) {
		if (!imp->imp_deactive)
			DEBUG_REQ(D_NET, req, "IMP_INVALID");
		*status = -ESHUTDOWN; /* bz 12940 */
	} else if (req->rq_import_generation != imp->imp_generation) {
		DEBUG_REQ(D_ERROR, req, "req wrong generation:");
		*status = -EIO;
	} else if (req->rq_send_state != imp->imp_state) {
		/* invalidate in progress - any requests should be dropped */
		if (atomic_read(&imp->imp_inval_count) != 0) {
			DEBUG_REQ(D_ERROR, req, "invalidate in flight");
			*status = -EIO;
		} else if (imp->imp_dlm_fake || req->rq_no_delay) {
			*status = -EWOULDBLOCK;
		} else if (req->rq_allow_replay &&
			   (imp->imp_state == LUSTRE_IMP_REPLAY ||
			    imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS ||
			    imp->imp_state == LUSTRE_IMP_REPLAY_WAIT ||
			    imp->imp_state == LUSTRE_IMP_RECOVER)) {
			DEBUG_REQ(D_HA, req, "allow during recovery.\n");
		} else {
			delay = 1;
		}
	}

	return delay;
}
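
/*
 * Summary of the decision table above (informative, derived from the code):
 * ctx init/fini RPCs always pass; NEW/CLOSED imports and an expired send
 * limit fail with -EIO; an invalid or no-recovery import fails with
 * -ESHUTDOWN; a generation mismatch fails with -EIO; and a send-state
 * mismatch either fails fast (-EWOULDBLOCK for no-delay/fake-DLM callers),
 * is allowed through during replay/recovery, or is delayed.
 */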
1113 | ||
1114 | /** | |
930cef9a | 1115 | * Decide if the error message regarding provided request \a req |
d7e09d03 PT |
1116 | * should be printed to the console or not. |
1117 | * Makes it's decision on request status and other properties. | |
1118 | * Returns 1 to print error on the system console or 0 if not. | |
1119 | */ | |
1120 | static int ptlrpc_console_allow(struct ptlrpc_request *req) | |
1121 | { | |
1122 | __u32 opc; | |
1123 | int err; | |
1124 | ||
1125 | LASSERT(req->rq_reqmsg != NULL); | |
1126 | opc = lustre_msg_get_opc(req->rq_reqmsg); | |
1127 | ||
1128 | /* Suppress particular reconnect errors which are to be expected. No | |
1129 | * errors are suppressed for the initial connection on an import */ | |
1130 | if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) && | |
1131 | (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) { | |
1132 | ||
1133 | /* Suppress timed out reconnect requests */ | |
1134 | if (req->rq_timedout) | |
1135 | return 0; | |
1136 | ||
1137 | /* Suppress unavailable/again reconnect requests */ | |
1138 | err = lustre_msg_get_status(req->rq_repmsg); | |
1139 | if (err == -ENODEV || err == -EAGAIN) | |
1140 | return 0; | |
1141 | } | |
1142 | ||
1143 | return 1; | |
1144 | } | |
1145 | ||
1146 | /** | |
1147 | * Check request processing status. | |
1148 | * Returns the status. | |
1149 | */ | |
1150 | static int ptlrpc_check_status(struct ptlrpc_request *req) | |
1151 | { | |
1152 | int err; | |
d7e09d03 PT |
1153 | |
1154 | err = lustre_msg_get_status(req->rq_repmsg); | |
1155 | if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { | |
1156 | struct obd_import *imp = req->rq_import; | |
1157 | __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); | |
1158 | if (ptlrpc_console_allow(req)) | |
2d00bd17 | 1159 | LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s, operation %s failed with %d.\n", |
d7e09d03 PT |
1160 | imp->imp_obd->obd_name, |
1161 | libcfs_nid2str( | |
2d00bd17 | 1162 | imp->imp_connection->c_peer.nid), |
d7e09d03 | 1163 | ll_opcode2str(opc), err); |
0a3bdb00 | 1164 | return err < 0 ? err : -EINVAL; |
d7e09d03 PT |
1165 | } |
1166 | ||
1167 | if (err < 0) { | |
1168 | DEBUG_REQ(D_INFO, req, "status is %d", err); | |
1169 | } else if (err > 0) { | |
1170 | /* XXX: translate this error from net to host */ | |
1171 | DEBUG_REQ(D_INFO, req, "status is %d", err); | |
1172 | } | |
1173 | ||
0a3bdb00 | 1174 | return err; |
d7e09d03 PT |
1175 | } |
1176 | ||
1177 | /** | |
1178 | * save pre-versions of objects into request for replay. | |
1179 | * Versions are obtained from server reply. | |
1180 | * used for VBR. | |
1181 | */ | |
1182 | static void ptlrpc_save_versions(struct ptlrpc_request *req) | |
1183 | { | |
1184 | struct lustre_msg *repmsg = req->rq_repmsg; | |
1185 | struct lustre_msg *reqmsg = req->rq_reqmsg; | |
1186 | __u64 *versions = lustre_msg_get_versions(repmsg); | |
d7e09d03 PT |
1187 | |
1188 | if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) | |
1189 | return; | |
1190 | ||
1191 | LASSERT(versions); | |
1192 | lustre_msg_set_versions(reqmsg, versions); | |
55f5a824 | 1193 | CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n", |
d7e09d03 | 1194 | versions[0], versions[1]); |
d7e09d03 PT |
1195 | } |
1196 | ||
1197 | /** | |
1198 | * Callback function called when client receives RPC reply for \a req. | |
1199 | * Returns 0 on success or error code. | |
930cef9a | 1200 | * The return value would be assigned to req->rq_status by the caller |
d7e09d03 PT |
1201 | * as request processing status. |
1202 | * This function also decides if the request needs to be saved for later replay. | |
1203 | */ | |
1204 | static int after_reply(struct ptlrpc_request *req) | |
1205 | { | |
1206 | struct obd_import *imp = req->rq_import; | |
1207 | struct obd_device *obd = req->rq_import->imp_obd; | |
1208 | int rc; | |
1209 | struct timeval work_start; | |
1210 | long timediff; | |
d7e09d03 PT |
1211 | |
1212 | LASSERT(obd != NULL); | |
1213 | /* repbuf must be unlinked */ | |
cf378ff7 | 1214 | LASSERT(!req->rq_receiving_reply && !req->rq_reply_unlink); |
d7e09d03 PT |
1215 | |
1216 | if (req->rq_reply_truncate) { | |
1217 | if (ptlrpc_no_resend(req)) { | |
2d00bd17 | 1218 | DEBUG_REQ(D_ERROR, req, "reply buffer overflow, expected: %d, actual size: %d", |
d7e09d03 | 1219 | req->rq_nob_received, req->rq_repbuf_len); |
0a3bdb00 | 1220 | return -EOVERFLOW; |
d7e09d03 PT |
1221 | } |
1222 | ||
1223 | sptlrpc_cli_free_repbuf(req); | |
1224 | /* Pass the required reply buffer size (include | |
1225 | * space for early reply). | |
1226 | * NB: no need to roundup because alloc_repbuf | |
1227 | * will roundup it */ | |
1228 | req->rq_replen = req->rq_nob_received; | |
1229 | req->rq_nob_received = 0; | |
15c50ccc | 1230 | spin_lock(&req->rq_lock); |
d7e09d03 | 1231 | req->rq_resend = 1; |
15c50ccc | 1232 | spin_unlock(&req->rq_lock); |
0a3bdb00 | 1233 | return 0; |
d7e09d03 PT |
1234 | } |
1235 | ||
1236 | /* | |
1237 | * NB Until this point, the whole of the incoming message, | |
1238 | * including buflens, status etc is in the sender's byte order. | |
1239 | */ | |
1240 | rc = sptlrpc_cli_unwrap_reply(req); | |
1241 | if (rc) { | |
1242 | DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc); | |
0a3bdb00 | 1243 | return rc; |
d7e09d03 PT |
1244 | } |
1245 | ||
1246 | /* | |
1247 | * Security layer unwrap might ask resend this request. | |
1248 | */ | |
1249 | if (req->rq_resend) | |
0a3bdb00 | 1250 | return 0; |
d7e09d03 PT |
1251 | |
1252 | rc = unpack_reply(req); | |
1253 | if (rc) | |
0a3bdb00 | 1254 | return rc; |
d7e09d03 PT |
1255 | |
1256 | /* retry indefinitely on EINPROGRESS */ | |
1257 | if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS && | |
1258 | ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) { | |
7264b8a5 | 1259 | time_t now = get_seconds(); |
d7e09d03 PT |
1260 | |
1261 | DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS"); | |
d27f9b07 | 1262 | spin_lock(&req->rq_lock); |
d7e09d03 | 1263 | req->rq_resend = 1; |
d27f9b07 | 1264 | spin_unlock(&req->rq_lock); |
d7e09d03 PT |
1265 | req->rq_nr_resend++; |
1266 | ||
1267 | /* allocate new xid to avoid reply reconstruction */ | |
1268 | if (!req->rq_bulk) { | |
1269 | /* new xid is already allocated for bulk in | |
1270 | * ptlrpc_check_set() */ | |
1271 | req->rq_xid = ptlrpc_next_xid(); | |
2d00bd17 | 1272 | DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for resend on EINPROGRESS"); |
d7e09d03 PT |
1273 | } |
1274 | ||
1275 | /* Readjust the timeout for current conditions */ | |
1276 | ptlrpc_at_set_req_timeout(req); | |
1277 | /* delay resend to give a chance to the server to get ready. | |
1278 | * The delay is increased by 1s on every resend and is capped to | |
1279 | * the current request timeout (i.e. obd_timeout if AT is off, | |
1280 | * or AT service time x 125% + 5s, see at_est2timeout) */ | |
1281 | if (req->rq_nr_resend > req->rq_timeout) | |
1282 | req->rq_sent = now + req->rq_timeout; | |
1283 | else | |
1284 | req->rq_sent = now + req->rq_nr_resend; | |
1285 | ||
0a3bdb00 | 1286 | return 0; |
d7e09d03 PT |
1287 | } |
1288 | ||
1289 | do_gettimeofday(&work_start); | |
1290 | timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL); | |
1291 | if (obd->obd_svc_stats != NULL) { | |
1292 | lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, | |
1293 | timediff); | |
1294 | ptlrpc_lprocfs_rpc_sent(req, timediff); | |
1295 | } | |
1296 | ||
1297 | if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && | |
1298 | lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { | |
1299 | DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)", | |
1300 | lustre_msg_get_type(req->rq_repmsg)); | |
0a3bdb00 | 1301 | return -EPROTO; |
d7e09d03 PT |
1302 | } |
1303 | ||
1304 | if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) | |
1305 | CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); | |
1306 | ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); | |
1307 | ptlrpc_at_adj_net_latency(req, | |
1308 | lustre_msg_get_service_time(req->rq_repmsg)); | |
1309 | ||
1310 | rc = ptlrpc_check_status(req); | |
1311 | imp->imp_connect_error = rc; | |
1312 | ||
1313 | if (rc) { | |
1314 | /* | |
1315 | * Either we've been evicted, or the server has failed for | |
1316 | * some reason. Try to reconnect, and if that fails, punt to | |
1317 | * the upcall. | |
1318 | */ | |
1319 | if (ll_rpc_recoverable_error(rc)) { | |
1320 | if (req->rq_send_state != LUSTRE_IMP_FULL || | |
1321 | imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { | |
0a3bdb00 | 1322 | return rc; |
d7e09d03 PT |
1323 | } |
1324 | ptlrpc_request_handle_notconn(req); | |
0a3bdb00 | 1325 | return rc; |
d7e09d03 PT |
1326 | } |
1327 | } else { | |
1328 | /* | |
1329 | * Let's look if server sent slv. Do it only for RPC with | |
1330 | * rc == 0. | |
1331 | */ | |
1332 | ldlm_cli_update_pool(req); | |
1333 | } | |
1334 | ||
1335 | /* | |
1336 | * Store transno in reqmsg for replay. | |
1337 | */ | |
1338 | if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { | |
1339 | req->rq_transno = lustre_msg_get_transno(req->rq_repmsg); | |
1340 | lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno); | |
1341 | } | |
1342 | ||
1343 | if (imp->imp_replayable) { | |
1344 | spin_lock(&imp->imp_lock); | |
1345 | /* | |
1346 | * No point in adding already-committed requests to the replay | |
1347 | * list, we will just remove them immediately. b=9829 | |
1348 | */ | |
1349 | if (req->rq_transno != 0 && | |
1350 | (req->rq_transno > | |
1351 | lustre_msg_get_last_committed(req->rq_repmsg) || | |
1352 | req->rq_replay)) { | |
1353 | /** version recovery */ | |
1354 | ptlrpc_save_versions(req); | |
1355 | ptlrpc_retain_replayable_request(req, imp); | |
503a1ac7 LZ |
1356 | } else if (req->rq_commit_cb != NULL && |
1357 | list_empty(&req->rq_replay_list)) { | |
1358 | /* NB: don't call rq_commit_cb if it's already on | |
1359 | * rq_replay_list, ptlrpc_free_committed() will call | |
1360 | * it later, see LU-3618 for details */ | |
d7e09d03 PT |
1361 | spin_unlock(&imp->imp_lock); |
1362 | req->rq_commit_cb(req); | |
1363 | spin_lock(&imp->imp_lock); | |
1364 | } | |
1365 | ||
1366 | /* | |
1367 | * Replay-enabled imports return commit-status information. | |
1368 | */ | |
1369 | if (lustre_msg_get_last_committed(req->rq_repmsg)) { | |
1370 | imp->imp_peer_committed_transno = | |
1371 | lustre_msg_get_last_committed(req->rq_repmsg); | |
1372 | } | |
1373 | ||
1374 | ptlrpc_free_committed(imp); | |
1375 | ||
1376 | if (!list_empty(&imp->imp_replay_list)) { | |
1377 | struct ptlrpc_request *last; | |
1378 | ||
1379 | last = list_entry(imp->imp_replay_list.prev, | |
1380 | struct ptlrpc_request, | |
1381 | rq_replay_list); | |
1382 | /* | |
1383 | * Requests with rq_replay stay on the list even if no | |
1384 | * commit is expected. | |
1385 | */ | |
1386 | if (last->rq_transno > imp->imp_peer_committed_transno) | |
1387 | ptlrpc_pinger_commit_expected(imp); | |
1388 | } | |
1389 | ||
1390 | spin_unlock(&imp->imp_lock); | |
1391 | } | |
1392 | ||
0a3bdb00 | 1393 | return rc; |
d7e09d03 PT |
1394 | } |
1395 | ||
1396 | /** | |
1397 | * Helper function to send request \a req over the network for the first time | |
1398 | * Also adjusts request phase. | |
1399 | * Returns 0 on success or error code. | |
1400 | */ | |
1401 | static int ptlrpc_send_new_req(struct ptlrpc_request *req) | |
1402 | { | |
d0bfef31 | 1403 | struct obd_import *imp = req->rq_import; |
d7e09d03 | 1404 | int rc; |
d7e09d03 PT |
1405 | |
1406 | LASSERT(req->rq_phase == RQ_PHASE_NEW); | |
7264b8a5 | 1407 | if (req->rq_sent && (req->rq_sent > get_seconds()) && |
d7e09d03 PT |
1408 | (!req->rq_generation_set || |
1409 | req->rq_import_generation == imp->imp_generation)) | |
0a3bdb00 | 1410 | return 0; |
d7e09d03 PT |
1411 | |
1412 | ptlrpc_rqphase_move(req, RQ_PHASE_RPC); | |
1413 | ||
1414 | spin_lock(&imp->imp_lock); | |
1415 | ||
1416 | if (!req->rq_generation_set) | |
1417 | req->rq_import_generation = imp->imp_generation; | |
1418 | ||
1419 | if (ptlrpc_import_delay_req(imp, req, &rc)) { | |
1420 | spin_lock(&req->rq_lock); | |
1421 | req->rq_waiting = 1; | |
1422 | spin_unlock(&req->rq_lock); | |
1423 | ||
2d00bd17 JP |
1424 | DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: (%s != %s)", |
1425 | lustre_msg_get_status(req->rq_reqmsg), | |
d7e09d03 PT |
1426 | ptlrpc_import_state_name(req->rq_send_state), |
1427 | ptlrpc_import_state_name(imp->imp_state)); | |
1428 | LASSERT(list_empty(&req->rq_list)); | |
1429 | list_add_tail(&req->rq_list, &imp->imp_delayed_list); | |
1430 | atomic_inc(&req->rq_import->imp_inflight); | |
1431 | spin_unlock(&imp->imp_lock); | |
0a3bdb00 | 1432 | return 0; |
d7e09d03 PT |
1433 | } |
1434 | ||
1435 | if (rc != 0) { | |
1436 | spin_unlock(&imp->imp_lock); | |
1437 | req->rq_status = rc; | |
1438 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
0a3bdb00 | 1439 | return rc; |
d7e09d03 PT |
1440 | } |
1441 | ||
1442 | LASSERT(list_empty(&req->rq_list)); | |
1443 | list_add_tail(&req->rq_list, &imp->imp_sending_list); | |
1444 | atomic_inc(&req->rq_import->imp_inflight); | |
1445 | spin_unlock(&imp->imp_lock); | |
1446 | ||
1447 | lustre_msg_set_status(req->rq_reqmsg, current_pid()); | |
1448 | ||
1449 | rc = sptlrpc_req_refresh_ctx(req, -1); | |
1450 | if (rc) { | |
1451 | if (req->rq_err) { | |
1452 | req->rq_status = rc; | |
0a3bdb00 | 1453 | return 1; |
d7e09d03 | 1454 | } |
5b9359f1 MY |
1455 | spin_lock(&req->rq_lock); |
1456 | req->rq_wait_ctx = 1; | |
1457 | spin_unlock(&req->rq_lock); | |
1458 | return 0; | |
d7e09d03 PT |
1459 | } |
1460 | ||
2d00bd17 JP |
1461 | CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n", |
1462 | current_comm(), | |
d7e09d03 PT |
1463 | imp->imp_obd->obd_uuid.uuid, |
1464 | lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, | |
1465 | libcfs_nid2str(imp->imp_connection->c_peer.nid), | |
1466 | lustre_msg_get_opc(req->rq_reqmsg)); | |
1467 | ||
1468 | rc = ptl_send_rpc(req, 0); | |
1469 | if (rc) { | |
1470 | DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc); | |
15c50ccc | 1471 | spin_lock(&req->rq_lock); |
d7e09d03 | 1472 | req->rq_net_err = 1; |
15c50ccc | 1473 | spin_unlock(&req->rq_lock); |
0a3bdb00 | 1474 | return rc; |
d7e09d03 | 1475 | } |
0a3bdb00 | 1476 | return 0; |
d7e09d03 PT |
1477 | } |
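/*
 * Informal overview of the send path above (names match the code):
 *
 *	RQ_PHASE_NEW --> RQ_PHASE_RPC
 *	  import not ready: rq_waiting = 1, park on imp_delayed_list
 *	  import ready:     move to imp_sending_list, then ptl_send_rpc()
 *	  send failure:     rq_net_err = 1, recovered later by timeout/resend
 */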
1478 | ||
1479 | static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set) | |
1480 | { | |
1481 | int remaining, rc; | |
d7e09d03 PT |
1482 | |
1483 | LASSERT(set->set_producer != NULL); | |
1484 | ||
1485 | remaining = atomic_read(&set->set_remaining); | |
1486 | ||
1487 | /* populate the ->set_requests list with requests until we | |
1488 | * reach the maximum number of RPCs in flight for this set */ | |
1489 | while (atomic_read(&set->set_remaining) < set->set_max_inflight) { | |
1490 | rc = set->set_producer(set, set->set_producer_arg); | |
1491 | if (rc == -ENOENT) { | |
1492 | /* no more RPC to produce */ | |
1493 | set->set_producer = NULL; | |
1494 | set->set_producer_arg = NULL; | |
0a3bdb00 | 1495 | return 0; |
d7e09d03 PT |
1496 | } |
1497 | } | |
1498 | ||
0a3bdb00 | 1499 | return (atomic_read(&set->set_remaining) - remaining); |
d7e09d03 PT |
1500 | } |
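/*
 * A producer callback enqueues one request per call and returns -ENOENT
 * once it has nothing more to produce. Minimal sketch (build_next_request
 * is a hypothetical helper, shown for illustration only):
 *
 *	static int my_producer(struct ptlrpc_request_set *set, void *arg)
 *	{
 *		struct ptlrpc_request *req = build_next_request(arg);
 *
 *		if (req == NULL)
 *			return -ENOENT;
 *		ptlrpc_set_add_req(set, req);
 *		return 0;
 *	}
 */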
1501 | ||
1502 | /** | |
1503 | * This sends any unsent RPCs in \a set and returns 1 if all are sent | |
1504 | * and no more replies are expected. | |
1505 | * (It is possible to get fewer replies than requests sent, e.g. due to | |
1506 | * timed-out requests or requests that we had trouble sending out.) | |
da9e33c9 CM |
1507 | * |
1508 | * NOTE: This function contains a potential schedule point (cond_resched()). | |
d7e09d03 PT |
1509 | */ |
1510 | int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) | |
1511 | { | |
1512 | struct list_head *tmp, *next; | |
fa55c6a4 | 1513 | struct list_head comp_reqs; |
d7e09d03 | 1514 | int force_timer_recalc = 0; |
d7e09d03 PT |
1515 | |
1516 | if (atomic_read(&set->set_remaining) == 0) | |
0a3bdb00 | 1517 | return 1; |
d7e09d03 | 1518 | |
fa55c6a4 | 1519 | INIT_LIST_HEAD(&comp_reqs); |
d7e09d03 PT |
1520 | list_for_each_safe(tmp, next, &set->set_requests) { |
1521 | struct ptlrpc_request *req = | |
1522 | list_entry(tmp, struct ptlrpc_request, | |
1523 | rq_set_chain); | |
1524 | struct obd_import *imp = req->rq_import; | |
1525 | int unregistered = 0; | |
1526 | int rc = 0; | |
1527 | ||
da9e33c9 CM |
1528 | /* This schedule point is mainly for the ptlrpcd caller of this |
1529 | * function. Most ptlrpc sets are not long-lived and unbounded | |
1530 | * in length, but at least the one used by the ptlrpcd is. | |
1531 | * Since the processing time is unbounded, we need to insert an | |
1532 | * explicit schedule point to make the thread well-behaved. | |
1533 | */ | |
1534 | cond_resched(); | |
1535 | ||
d7e09d03 PT |
1536 | if (req->rq_phase == RQ_PHASE_NEW && |
1537 | ptlrpc_send_new_req(req)) { | |
1538 | force_timer_recalc = 1; | |
1539 | } | |
1540 | ||
1541 | /* delayed send - skip */ | |
1542 | if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) | |
1543 | continue; | |
1544 | ||
1545 | /* delayed resend - skip */ | |
1546 | if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && | |
7264b8a5 | 1547 | req->rq_sent > get_seconds()) |
d7e09d03 PT |
1548 | continue; |
1549 | ||
1550 | if (!(req->rq_phase == RQ_PHASE_RPC || | |
1551 | req->rq_phase == RQ_PHASE_BULK || | |
1552 | req->rq_phase == RQ_PHASE_INTERPRET || | |
1553 | req->rq_phase == RQ_PHASE_UNREGISTERING || | |
1554 | req->rq_phase == RQ_PHASE_COMPLETE)) { | |
1555 | DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); | |
1556 | LBUG(); | |
1557 | } | |
1558 | ||
1559 | if (req->rq_phase == RQ_PHASE_UNREGISTERING) { | |
1560 | LASSERT(req->rq_next_phase != req->rq_phase); | |
1561 | LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); | |
1562 | ||
1563 | /* | |
1564 | * Skip processing until reply is unlinked. We | |
1565 | * can't return to pool before that and we can't | |
1566 | * call interpret before that. We need to make | |
1567 | * sure that all rdma transfers finished and will | |
1568 | * not corrupt any data. | |
1569 | */ | |
1570 | if (ptlrpc_client_recv_or_unlink(req) || | |
1571 | ptlrpc_client_bulk_active(req)) | |
1572 | continue; | |
1573 | ||
1574 | /* | |
1575 | * Turn fail_loc off to prevent it from looping | |
1576 | * forever. | |
1577 | */ | |
1578 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { | |
1579 | OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, | |
1580 | OBD_FAIL_ONCE); | |
1581 | } | |
1582 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { | |
1583 | OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, | |
1584 | OBD_FAIL_ONCE); | |
1585 | } | |
1586 | ||
1587 | /* | |
1588 | * Move to next phase if reply was successfully | |
1589 | * unlinked. | |
1590 | */ | |
1591 | ptlrpc_rqphase_move(req, req->rq_next_phase); | |
1592 | } | |
1593 | ||
fa55c6a4 LZ |
1594 | if (req->rq_phase == RQ_PHASE_COMPLETE) { |
1595 | list_move_tail(&req->rq_set_chain, &comp_reqs); | |
d7e09d03 | 1596 | continue; |
fa55c6a4 | 1597 | } |
d7e09d03 PT |
1598 | |
1599 | if (req->rq_phase == RQ_PHASE_INTERPRET) | |
a9b3e8f3 | 1600 | goto interpret; |
d7e09d03 PT |
1601 | |
1602 | /* | |
1603 | * Note that this also will start async reply unlink. | |
1604 | */ | |
1605 | if (req->rq_net_err && !req->rq_timedout) { | |
1606 | ptlrpc_expire_one_request(req, 1); | |
1607 | ||
1608 | /* | |
1609 | * Check if we still need to wait for unlink. | |
1610 | */ | |
1611 | if (ptlrpc_client_recv_or_unlink(req) || | |
1612 | ptlrpc_client_bulk_active(req)) | |
1613 | continue; | |
1614 | /* If there is no need to resend, fail it now. */ | |
1615 | if (req->rq_no_resend) { | |
1616 | if (req->rq_status == 0) | |
1617 | req->rq_status = -EIO; | |
1618 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
a9b3e8f3 | 1619 | goto interpret; |
d7e09d03 PT |
1620 | } else { |
1621 | continue; | |
1622 | } | |
1623 | } | |
1624 | ||
1625 | if (req->rq_err) { | |
1626 | spin_lock(&req->rq_lock); | |
1627 | req->rq_replied = 0; | |
1628 | spin_unlock(&req->rq_lock); | |
1629 | if (req->rq_status == 0) | |
1630 | req->rq_status = -EIO; | |
1631 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
a9b3e8f3 | 1632 | goto interpret; |
d7e09d03 PT |
1633 | } |
1634 | ||
1635 | /* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr | |
1636 | * so it sets rq_intr regardless of individual rpc | |
1637 | * timeouts. The synchronous IO waiting path sets | |
1638 | * rq_intr irrespective of whether ptlrpcd | |
1639 | * has seen a timeout. Our policy is to only interpret | |
1640 | * interrupted rpcs after they have timed out, so we | |
1641 | * need to enforce that here. | |
1642 | */ | |
1643 | ||
1644 | if (req->rq_intr && (req->rq_timedout || req->rq_waiting || | |
1645 | req->rq_wait_ctx)) { | |
1646 | req->rq_status = -EINTR; | |
1647 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
a9b3e8f3 | 1648 | goto interpret; |
d7e09d03 PT |
1649 | } |
1650 | ||
1651 | if (req->rq_phase == RQ_PHASE_RPC) { | |
1652 | if (req->rq_timedout || req->rq_resend || | |
1653 | req->rq_waiting || req->rq_wait_ctx) { | |
1654 | int status; | |
1655 | ||
1656 | if (!ptlrpc_unregister_reply(req, 1)) | |
1657 | continue; | |
1658 | ||
1659 | spin_lock(&imp->imp_lock); | |
cb68dd2d KM |
1660 | if (ptlrpc_import_delay_req(imp, req, |
1661 | &status)) { | |
d7e09d03 PT |
1662 | /* put on the delayed list - only while | |
1663 | * waiting for recovery to finish - before send */ | |
1664 | list_del_init(&req->rq_list); | |
1665 | list_add_tail(&req->rq_list, | |
1666 | &imp-> | |
1667 | imp_delayed_list); | |
1668 | spin_unlock(&imp->imp_lock); | |
1669 | continue; | |
1670 | } | |
1671 | ||
d0bfef31 | 1672 | if (status != 0) { |
d7e09d03 PT |
1673 | req->rq_status = status; |
1674 | ptlrpc_rqphase_move(req, | |
1675 | RQ_PHASE_INTERPRET); | |
1676 | spin_unlock(&imp->imp_lock); | |
a9b3e8f3 | 1677 | goto interpret; |
d7e09d03 PT |
1678 | } |
1679 | if (ptlrpc_no_resend(req) && | |
1680 | !req->rq_wait_ctx) { | |
1681 | req->rq_status = -ENOTCONN; | |
1682 | ptlrpc_rqphase_move(req, | |
1683 | RQ_PHASE_INTERPRET); | |
1684 | spin_unlock(&imp->imp_lock); | |
a9b3e8f3 | 1685 | goto interpret; |
d7e09d03 PT |
1686 | } |
1687 | ||
1688 | list_del_init(&req->rq_list); | |
1689 | list_add_tail(&req->rq_list, | |
1690 | &imp->imp_sending_list); | |
1691 | ||
1692 | spin_unlock(&imp->imp_lock); | |
1693 | ||
1694 | spin_lock(&req->rq_lock); | |
1695 | req->rq_waiting = 0; | |
1696 | spin_unlock(&req->rq_lock); | |
1697 | ||
1698 | if (req->rq_timedout || req->rq_resend) { | |
1699 | /* This is re-sending anyway, | |
1700 | * let's mark req as resend. */ | |
1701 | spin_lock(&req->rq_lock); | |
1702 | req->rq_resend = 1; | |
1703 | spin_unlock(&req->rq_lock); | |
1704 | if (req->rq_bulk) { | |
1705 | __u64 old_xid; | |
1706 | ||
1707 | if (!ptlrpc_unregister_bulk(req, 1)) | |
1708 | continue; | |
1709 | ||
1710 | /* ensure previous bulk fails */ | |
1711 | old_xid = req->rq_xid; | |
1712 | req->rq_xid = ptlrpc_next_xid(); | |
b0f5aad5 | 1713 | CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n", |
d7e09d03 PT |
1714 | old_xid, req->rq_xid); |
1715 | } | |
1716 | } | |
1717 | /* | |
1718 | * rq_wait_ctx is only touched by ptlrpcd, | |
1719 | * so no lock is needed here. | |
1720 | */ | |
1721 | status = sptlrpc_req_refresh_ctx(req, -1); | |
1722 | if (status) { | |
1723 | if (req->rq_err) { | |
1724 | req->rq_status = status; | |
1725 | spin_lock(&req->rq_lock); | |
1726 | req->rq_wait_ctx = 0; | |
1727 | spin_unlock(&req->rq_lock); | |
1728 | force_timer_recalc = 1; | |
1729 | } else { | |
1730 | spin_lock(&req->rq_lock); | |
1731 | req->rq_wait_ctx = 1; | |
1732 | spin_unlock(&req->rq_lock); | |
1733 | } | |
1734 | ||
1735 | continue; | |
1736 | } else { | |
1737 | spin_lock(&req->rq_lock); | |
1738 | req->rq_wait_ctx = 0; | |
1739 | spin_unlock(&req->rq_lock); | |
1740 | } | |
1741 | ||
1742 | rc = ptl_send_rpc(req, 0); | |
1743 | if (rc) { | |
1744 | DEBUG_REQ(D_HA, req, | |
1745 | "send failed: rc = %d", rc); | |
1746 | force_timer_recalc = 1; | |
1747 | spin_lock(&req->rq_lock); | |
1748 | req->rq_net_err = 1; | |
1749 | spin_unlock(&req->rq_lock); | |
e3bceb23 | 1750 | continue; |
d7e09d03 PT |
1751 | } |
1752 | /* need to reset the timeout */ | |
1753 | force_timer_recalc = 1; | |
1754 | } | |
1755 | ||
1756 | spin_lock(&req->rq_lock); | |
1757 | ||
1758 | if (ptlrpc_client_early(req)) { | |
1759 | ptlrpc_at_recv_early_reply(req); | |
1760 | spin_unlock(&req->rq_lock); | |
1761 | continue; | |
1762 | } | |
1763 | ||
1764 | /* Still waiting for a reply? */ | |
1765 | if (ptlrpc_client_recv(req)) { | |
1766 | spin_unlock(&req->rq_lock); | |
1767 | continue; | |
1768 | } | |
1769 | ||
1770 | /* Did we actually receive a reply? */ | |
1771 | if (!ptlrpc_client_replied(req)) { | |
1772 | spin_unlock(&req->rq_lock); | |
1773 | continue; | |
1774 | } | |
1775 | ||
1776 | spin_unlock(&req->rq_lock); | |
1777 | ||
1778 | /* unlink from the net because we are going to | |
1779 | * swab the reply buffer in place */ | |
1780 | unregistered = ptlrpc_unregister_reply(req, 1); | |
1781 | if (!unregistered) | |
1782 | continue; | |
1783 | ||
1784 | req->rq_status = after_reply(req); | |
1785 | if (req->rq_resend) | |
1786 | continue; | |
1787 | ||
1788 | /* If there is no bulk associated with this request, | |
1789 | * then we're done and should let the interpreter | |
1790 | * process the reply. Similarly if the RPC returned | |
1791 | * an error, and therefore the bulk will never arrive. | |
1792 | */ | |
1793 | if (req->rq_bulk == NULL || req->rq_status < 0) { | |
1794 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
a9b3e8f3 | 1795 | goto interpret; |
d7e09d03 PT |
1796 | } |
1797 | ||
1798 | ptlrpc_rqphase_move(req, RQ_PHASE_BULK); | |
1799 | } | |
1800 | ||
1801 | LASSERT(req->rq_phase == RQ_PHASE_BULK); | |
1802 | if (ptlrpc_client_bulk_active(req)) | |
1803 | continue; | |
1804 | ||
1805 | if (req->rq_bulk->bd_failure) { | |
1806 | /* The RPC reply arrived OK, but the bulk screwed | |
1807 | * up! Dead weird since the server told us the RPC | |
1808 | * was good after getting the REPLY for her GET or | |
1809 | * the ACK for her PUT. */ | |
1810 | DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); | |
1811 | req->rq_status = -EIO; | |
1812 | } | |
1813 | ||
1814 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
1815 | ||
7f1d15a8 | 1816 | interpret: |
d7e09d03 PT |
1817 | LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); |
1818 | ||
1819 | /* This moves to the "unregistering" phase; we need to wait for | |
1820 | * the reply unlink. */ | |
1821 | if (!unregistered && !ptlrpc_unregister_reply(req, 1)) { | |
1822 | /* start async bulk unlink too */ | |
1823 | ptlrpc_unregister_bulk(req, 1); | |
1824 | continue; | |
1825 | } | |
1826 | ||
1827 | if (!ptlrpc_unregister_bulk(req, 1)) | |
1828 | continue; | |
1829 | ||
1830 | /* By the time interpret is called, receiving should already | |
1831 | * be finished. */ | |
1832 | LASSERT(!req->rq_receiving_reply); | |
1833 | ||
1834 | ptlrpc_req_interpret(env, req, req->rq_status); | |
1835 | ||
82a373ae LZ |
1836 | if (ptlrpcd_check_work(req)) { |
1837 | atomic_dec(&set->set_remaining); | |
1838 | continue; | |
1839 | } | |
d7e09d03 PT |
1840 | ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); |
1841 | ||
1842 | CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0, | |
2d00bd17 JP |
1843 | "Completed RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n", |
1844 | current_comm(), imp->imp_obd->obd_uuid.uuid, | |
1845 | lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, | |
1846 | libcfs_nid2str(imp->imp_connection->c_peer.nid), | |
1847 | lustre_msg_get_opc(req->rq_reqmsg)); | |
d7e09d03 PT |
1848 | |
1849 | spin_lock(&imp->imp_lock); | |
1850 | /* The request may no longer be on the sending or delayed list. This | |
1851 | * can happen when it was marked erroneous because | |
1852 | * ptlrpc_import_delay_req(req, status) found it impossible to | |
1853 | * allow sending this rpc and returned *status != 0. */ | |
1854 | if (!list_empty(&req->rq_list)) { | |
1855 | list_del_init(&req->rq_list); | |
1856 | atomic_dec(&imp->imp_inflight); | |
1857 | } | |
1858 | spin_unlock(&imp->imp_lock); | |
1859 | ||
1860 | atomic_dec(&set->set_remaining); | |
1861 | wake_up_all(&imp->imp_recovery_waitq); | |
1862 | ||
1863 | if (set->set_producer) { | |
1864 | /* produce a new request if possible */ | |
1865 | if (ptlrpc_set_producer(set) > 0) | |
1866 | force_timer_recalc = 1; | |
1867 | ||
1868 | /* free the request that has just been completed | |
1869 | * in order not to pollute set->set_requests */ | |
1870 | list_del_init(&req->rq_set_chain); | |
1871 | spin_lock(&req->rq_lock); | |
1872 | req->rq_set = NULL; | |
1873 | req->rq_invalid_rqset = 0; | |
1874 | spin_unlock(&req->rq_lock); | |
1875 | ||
1876 | /* record rq_status to compute the final status later */ | |
1877 | if (req->rq_status != 0) | |
1878 | set->set_rc = req->rq_status; | |
1879 | ptlrpc_req_finished(req); | |
fa55c6a4 LZ |
1880 | } else { |
1881 | list_move_tail(&req->rq_set_chain, &comp_reqs); | |
d7e09d03 PT |
1882 | } |
1883 | } | |
1884 | ||
fa55c6a4 LZ |
1885 | /* move completed requests to the head of the list so it's easier | |
1886 | * for the caller to find them */ | |
1887 | list_splice(&comp_reqs, &set->set_requests); | |
1888 | ||
d7e09d03 | 1889 | /* If we hit an error, we want to recover promptly. */ |
0a3bdb00 | 1890 | return atomic_read(&set->set_remaining) == 0 || force_timer_recalc; |
d7e09d03 PT |
1891 | } |
1892 | EXPORT_SYMBOL(ptlrpc_check_set); | |
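/*
 * Informal summary of the per-request phase machine driven above:
 *
 *	NEW -> RPC -> [BULK] -> INTERPRET -> COMPLETE
 *	         \-> UNREGISTERING -> (saved rq_next_phase)
 *
 * UNREGISTERING is entered whenever reply or bulk buffers still have to
 * be unlinked from LNet before the request may be interpreted or reused.
 */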
1893 | ||
1894 | /** | |
1895 | * Time out request \a req. If \a async_unlink is set, do not wait | |
1896 | * until LNet actually confirms network buffer unlinking. | |
1897 | * Return 1 if we should give up further retrying attempts or 0 otherwise. | |
1898 | */ | |
1899 | int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) | |
1900 | { | |
1901 | struct obd_import *imp = req->rq_import; | |
1902 | int rc = 0; | |
d7e09d03 PT |
1903 | |
1904 | spin_lock(&req->rq_lock); | |
1905 | req->rq_timedout = 1; | |
1906 | spin_unlock(&req->rq_lock); | |
1907 | ||
1908 | DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T | |
1909 | "/real "CFS_DURATION_T"]", | |
1910 | req->rq_net_err ? "failed due to network error" : | |
1911 | ((req->rq_real_sent == 0 || | |
699503bc | 1912 | time_before((unsigned long)req->rq_real_sent, (unsigned long)req->rq_sent) || |
d7e09d03 PT |
1913 | cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ? |
1914 | "timed out for sent delay" : "timed out for slow reply"), | |
1915 | req->rq_sent, req->rq_real_sent); | |
1916 | ||
1917 | if (imp != NULL && obd_debug_peer_on_timeout) | |
1918 | LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer); | |
1919 | ||
1920 | ptlrpc_unregister_reply(req, async_unlink); | |
1921 | ptlrpc_unregister_bulk(req, async_unlink); | |
1922 | ||
1923 | if (obd_dump_on_timeout) | |
1924 | libcfs_debug_dumplog(); | |
1925 | ||
1926 | if (imp == NULL) { | |
1927 | DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); | |
0a3bdb00 | 1928 | return 1; |
d7e09d03 PT |
1929 | } |
1930 | ||
1931 | atomic_inc(&imp->imp_timeouts); | |
1932 | ||
1933 | /* The DLM server doesn't want recovery run on its imports. */ | |
1934 | if (imp->imp_dlm_fake) | |
0a3bdb00 | 1935 | return 1; |
d7e09d03 PT |
1936 | |
1937 | /* If this request is for recovery or other primordial tasks, | |
1938 | * then error it out here. */ | |
1939 | if (req->rq_ctx_init || req->rq_ctx_fini || | |
1940 | req->rq_send_state != LUSTRE_IMP_FULL || | |
1941 | imp->imp_obd->obd_no_recov) { | |
1942 | DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", | |
1943 | ptlrpc_import_state_name(req->rq_send_state), | |
1944 | ptlrpc_import_state_name(imp->imp_state)); | |
1945 | spin_lock(&req->rq_lock); | |
1946 | req->rq_status = -ETIMEDOUT; | |
1947 | req->rq_err = 1; | |
1948 | spin_unlock(&req->rq_lock); | |
0a3bdb00 | 1949 | return 1; |
d7e09d03 PT |
1950 | } |
1951 | ||
1952 | /* if a request can't be resent we can't wait for an answer after | |
1953 | * the timeout */ | |
1954 | if (ptlrpc_no_resend(req)) { | |
1955 | DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); | |
1956 | rc = 1; | |
1957 | } | |
1958 | ||
1959 | ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); | |
1960 | ||
0a3bdb00 | 1961 | return rc; |
d7e09d03 PT |
1962 | } |
1963 | ||
1964 | /** | |
1965 | * Time out all uncompleted requests in the request set pointed to by \a data | |
1966 | * Callback used when waiting on sets with l_wait_event. | |
1967 | * Always returns 1. | |
1968 | */ | |
1969 | int ptlrpc_expired_set(void *data) | |
1970 | { | |
1971 | struct ptlrpc_request_set *set = data; | |
d0bfef31 CH |
1972 | struct list_head *tmp; |
1973 | time_t now = get_seconds(); | |
d7e09d03 PT |
1974 | |
1975 | LASSERT(set != NULL); | |
1976 | ||
1977 | /* | |
1978 | * A timeout expired. See which reqs it applies to... | |
1979 | */ | |
3949015e | 1980 | list_for_each(tmp, &set->set_requests) { |
d7e09d03 PT |
1981 | struct ptlrpc_request *req = |
1982 | list_entry(tmp, struct ptlrpc_request, | |
1983 | rq_set_chain); | |
1984 | ||
1985 | /* don't expire request waiting for context */ | |
1986 | if (req->rq_wait_ctx) | |
1987 | continue; | |
1988 | ||
1989 | /* Request in-flight? */ | |
1990 | if (!((req->rq_phase == RQ_PHASE_RPC && | |
1991 | !req->rq_waiting && !req->rq_resend) || | |
1992 | (req->rq_phase == RQ_PHASE_BULK))) | |
1993 | continue; | |
1994 | ||
1995 | if (req->rq_timedout || /* already dealt with */ | |
1996 | req->rq_deadline > now) /* not expired */ | |
1997 | continue; | |
1998 | ||
1999 | /* Deal with this guy. Do it asynchronously to not block | |
2000 | * ptlrpcd thread. */ | |
2001 | ptlrpc_expire_one_request(req, 1); | |
2002 | } | |
2003 | ||
2004 | /* | |
2005 | * When waiting for a whole set, we always break out of the | |
2006 | * sleep so we can recalculate the timeout, or enable interrupts | |
2007 | * if everyone's timed out. | |
2008 | */ | |
0a3bdb00 | 2009 | return 1; |
d7e09d03 PT |
2010 | } |
2011 | EXPORT_SYMBOL(ptlrpc_expired_set); | |
2012 | ||
2013 | /** | |
2014 | * Sets rq_intr flag in \a req under spinlock. | |
2015 | */ | |
2016 | void ptlrpc_mark_interrupted(struct ptlrpc_request *req) | |
2017 | { | |
2018 | spin_lock(&req->rq_lock); | |
2019 | req->rq_intr = 1; | |
2020 | spin_unlock(&req->rq_lock); | |
2021 | } | |
2022 | EXPORT_SYMBOL(ptlrpc_mark_interrupted); | |
2023 | ||
2024 | /** | |
2025 | * Interrupts (sets interrupted flag) all uncompleted requests in | |
2026 | * a set \a data. Callback for l_wait_event for interruptible waits. | |
2027 | */ | |
2028 | void ptlrpc_interrupted_set(void *data) | |
2029 | { | |
2030 | struct ptlrpc_request_set *set = data; | |
2031 | struct list_head *tmp; | |
2032 | ||
2033 | LASSERT(set != NULL); | |
2034 | CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set); | |
2035 | ||
2036 | list_for_each(tmp, &set->set_requests) { | |
2037 | struct ptlrpc_request *req = | |
2038 | list_entry(tmp, struct ptlrpc_request, | |
2039 | rq_set_chain); | |
2040 | ||
2041 | if (req->rq_phase != RQ_PHASE_RPC && | |
2042 | req->rq_phase != RQ_PHASE_UNREGISTERING) | |
2043 | continue; | |
2044 | ||
2045 | ptlrpc_mark_interrupted(req); | |
2046 | } | |
2047 | } | |
2048 | EXPORT_SYMBOL(ptlrpc_interrupted_set); | |
2049 | ||
2050 | /** | |
2051 | * Get the smallest timeout in the set; this does NOT set a timeout. | |
2052 | */ | |
2053 | int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) | |
2054 | { | |
d0bfef31 CH |
2055 | struct list_head *tmp; |
2056 | time_t now = get_seconds(); | |
2057 | int timeout = 0; | |
d7e09d03 | 2058 | struct ptlrpc_request *req; |
d0bfef31 | 2059 | int deadline; |
d7e09d03 | 2060 | |
d7e09d03 PT |
2061 | list_for_each(tmp, &set->set_requests) { |
2062 | req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); | |
2063 | ||
2064 | /* | |
2065 | * Request in-flight? | |
2066 | */ | |
2067 | if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || | |
2068 | (req->rq_phase == RQ_PHASE_BULK) || | |
2069 | (req->rq_phase == RQ_PHASE_NEW))) | |
2070 | continue; | |
2071 | ||
2072 | /* | |
2073 | * Already timed out. | |
2074 | */ | |
2075 | if (req->rq_timedout) | |
2076 | continue; | |
2077 | ||
2078 | /* | |
2079 | * Waiting for ctx. | |
2080 | */ | |
2081 | if (req->rq_wait_ctx) | |
2082 | continue; | |
2083 | ||
2084 | if (req->rq_phase == RQ_PHASE_NEW) | |
2085 | deadline = req->rq_sent; | |
2086 | else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend) | |
2087 | deadline = req->rq_sent; | |
2088 | else | |
2089 | deadline = req->rq_sent + req->rq_timeout; | |
2090 | ||
2091 | if (deadline <= now) /* actually expired already */ | |
2092 | timeout = 1; /* ASAP */ | |
2093 | else if (timeout == 0 || timeout > deadline - now) | |
2094 | timeout = deadline - now; | |
2095 | } | |
0a3bdb00 | 2096 | return timeout; |
d7e09d03 PT |
2097 | } |
2098 | EXPORT_SYMBOL(ptlrpc_set_next_timeout); | |
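/*
 * Worked example: a request in RQ_PHASE_RPC sent at t = 100 with
 * rq_timeout = 30 has deadline 130; at now = 120 this function returns 10.
 * Requests still in RQ_PHASE_NEW or pending resend use rq_sent itself as
 * the deadline, and anything already past its deadline yields 1 ("ASAP").
 */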
2099 | ||
2100 | /** | |
930cef9a | 2101 | * Send all unsent requests from the set and then wait until all
d7e09d03 PT |
2102 | * requests in the set complete (either get a reply, timeout, get an |
2103 | * error or otherwise be interrupted). | |
2104 | * Returns 0 on success or error code otherwise. | |
2105 | */ | |
2106 | int ptlrpc_set_wait(struct ptlrpc_request_set *set) | |
2107 | { | |
d0bfef31 | 2108 | struct list_head *tmp; |
d7e09d03 | 2109 | struct ptlrpc_request *req; |
d0bfef31 CH |
2110 | struct l_wait_info lwi; |
2111 | int rc, timeout; | |
d7e09d03 PT |
2112 | |
2113 | if (set->set_producer) | |
2114 | (void)ptlrpc_set_producer(set); | |
2115 | else | |
2116 | list_for_each(tmp, &set->set_requests) { | |
2117 | req = list_entry(tmp, struct ptlrpc_request, | |
2118 | rq_set_chain); | |
2119 | if (req->rq_phase == RQ_PHASE_NEW) | |
2120 | (void)ptlrpc_send_new_req(req); | |
2121 | } | |
2122 | ||
2123 | if (list_empty(&set->set_requests)) | |
0a3bdb00 | 2124 | return 0; |
d7e09d03 PT |
2125 | |
2126 | do { | |
2127 | timeout = ptlrpc_set_next_timeout(set); | |
2128 | ||
2129 | /* wait until all complete, interrupted, or an in-flight | |
2130 | * req times out */ | |
2131 | CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n", | |
2132 | set, timeout); | |
2133 | ||
2134 | if (timeout == 0 && !cfs_signal_pending()) | |
2135 | /* | |
2136 | * No requests are in-flight (either timed out | |
2137 | * or delayed), so we can allow interrupts. | |
2138 | * We still want to block for a limited time, | |
2139 | * so we allow interrupts during the timeout. | |
2140 | */ | |
2141 | lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1), | |
2142 | ptlrpc_expired_set, | |
2143 | ptlrpc_interrupted_set, set); | |
2144 | else | |
2145 | /* | |
2146 | * At least one request is in flight, so no | |
2147 | * interrupts are allowed. Wait until all | |
2148 | * complete, or an in-flight req times out. | |
2149 | */ | |
0ae015be | 2150 | lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1), |
d7e09d03 PT |
2151 | ptlrpc_expired_set, set); |
2152 | ||
2153 | rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi); | |
2154 | ||
2155 | /* LU-769 - if we ignored the signal because it was already | |
2156 | * pending when we started, we need to handle it now or we risk | |
2157 | * it being ignored forever */ | |
2158 | if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr && | |
2159 | cfs_signal_pending()) { | |
2160 | sigset_t blocked_sigs = | |
2161 | cfs_block_sigsinv(LUSTRE_FATAL_SIGS); | |
2162 | ||
2163 | /* In fact we only interrupt for the "fatal" signals | |
2164 | * like SIGINT or SIGKILL. We still ignore less | |
2165 | * important signals since ptlrpc set is not easily | |
2166 | * reentrant from userspace again */ | |
2167 | if (cfs_signal_pending()) | |
2168 | ptlrpc_interrupted_set(set); | |
2169 | cfs_restore_sigs(blocked_sigs); | |
2170 | } | |
2171 | ||
2172 | LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT); | |
2173 | ||
2174 | /* -EINTR => all requests have been flagged rq_intr so next | |
2175 | * check completes. | |
2176 | * -ETIMEDOUT => someone timed out. When all reqs have | |
2177 | * timed out, signals are enabled allowing completion with | |
2178 | * EINTR. | |
2179 | * I don't really care if we go once more round the loop in | |
2180 | * the error cases -eeb. */ | |
2181 | if (rc == 0 && atomic_read(&set->set_remaining) == 0) { | |
2182 | list_for_each(tmp, &set->set_requests) { | |
2183 | req = list_entry(tmp, struct ptlrpc_request, | |
2184 | rq_set_chain); | |
2185 | spin_lock(&req->rq_lock); | |
2186 | req->rq_invalid_rqset = 1; | |
2187 | spin_unlock(&req->rq_lock); | |
2188 | } | |
2189 | } | |
2190 | } while (rc != 0 || atomic_read(&set->set_remaining) != 0); | |
2191 | ||
2192 | LASSERT(atomic_read(&set->set_remaining) == 0); | |
2193 | ||
2194 | rc = set->set_rc; /* rq_status of already freed requests if any */ | |
2195 | list_for_each(tmp, &set->set_requests) { | |
2196 | req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); | |
2197 | ||
2198 | LASSERT(req->rq_phase == RQ_PHASE_COMPLETE); | |
2199 | if (req->rq_status != 0) | |
2200 | rc = req->rq_status; | |
2201 | } | |
2202 | ||
2203 | if (set->set_interpret != NULL) { | |
0ae015be | 2204 | int (*interpreter)(struct ptlrpc_request_set *set, void *, int) = |
d7e09d03 | 2205 | set->set_interpret; |
295ce74f | 2206 | rc = interpreter(set, set->set_arg, rc); |
d7e09d03 PT |
2207 | } else { |
2208 | struct ptlrpc_set_cbdata *cbdata, *n; | |
2209 | int err; | |
2210 | ||
2211 | list_for_each_entry_safe(cbdata, n, | |
2212 | &set->set_cblist, psc_item) { | |
2213 | list_del_init(&cbdata->psc_item); | |
2214 | err = cbdata->psc_interpret(set, cbdata->psc_data, rc); | |
2215 | if (err && !rc) | |
2216 | rc = err; | |
9ae10597 | 2217 | kfree(cbdata); |
d7e09d03 PT |
2218 | } |
2219 | } | |
2220 | ||
0a3bdb00 | 2221 | return rc; |
d7e09d03 PT |
2222 | } |
2223 | EXPORT_SYMBOL(ptlrpc_set_wait); | |
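/*
 * See ptlrpc_queue_wait() below for the canonical synchronous pattern
 * built on this function: ptlrpc_prep_set() + ptlrpc_set_add_req() +
 * ptlrpc_set_wait() + ptlrpc_set_destroy().
 */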
2224 | ||
2225 | /** | |
930cef9a | 2226 | * Helper function for request freeing. |
d7e09d03 PT |
2227 | * Called when request count reached zero and request needs to be freed. |
2228 | * Removes request from all sorts of sending/replay lists it might be on, | |
2229 | * frees network buffers if any are present. | |
2230 | * If \a locked is set, the caller is already holding the import imp_lock | |
2231 | * and so we do not need to reobtain it (for certain list manipulations) | |
2232 | */ | |
2233 | static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) | |
2234 | { | |
3ff28049 | 2235 | if (request == NULL) |
d7e09d03 | 2236 | return; |
d7e09d03 | 2237 | LASSERTF(!request->rq_receiving_reply, "req %p\n", request); |
0ae015be | 2238 | LASSERTF(request->rq_rqbd == NULL, "req %p\n", request);/* client-side */ |
d7e09d03 PT |
2239 | LASSERTF(list_empty(&request->rq_list), "req %p\n", request); |
2240 | LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); | |
2241 | LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request); | |
2242 | LASSERTF(!request->rq_replay, "req %p\n", request); | |
2243 | ||
2244 | req_capsule_fini(&request->rq_pill); | |
2245 | ||
2246 | /* We must take it off the imp_replay_list first. Otherwise, we'll set | |
2247 | * request->rq_reqmsg to NULL while osc_close is dereferencing it. */ | |
2248 | if (request->rq_import != NULL) { | |
2249 | if (!locked) | |
2250 | spin_lock(&request->rq_import->imp_lock); | |
2251 | list_del_init(&request->rq_replay_list); | |
2252 | if (!locked) | |
2253 | spin_unlock(&request->rq_import->imp_lock); | |
2254 | } | |
2255 | LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request); | |
2256 | ||
2257 | if (atomic_read(&request->rq_refcount) != 0) { | |
2258 | DEBUG_REQ(D_ERROR, request, | |
2259 | "freeing request with nonzero refcount"); | |
2260 | LBUG(); | |
2261 | } | |
2262 | ||
2263 | if (request->rq_repbuf != NULL) | |
2264 | sptlrpc_cli_free_repbuf(request); | |
2265 | if (request->rq_export != NULL) { | |
2266 | class_export_put(request->rq_export); | |
2267 | request->rq_export = NULL; | |
2268 | } | |
2269 | if (request->rq_import != NULL) { | |
2270 | class_import_put(request->rq_import); | |
2271 | request->rq_import = NULL; | |
2272 | } | |
2273 | if (request->rq_bulk != NULL) | |
2274 | ptlrpc_free_bulk_pin(request->rq_bulk); | |
2275 | ||
2276 | if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL) | |
2277 | sptlrpc_cli_free_reqbuf(request); | |
2278 | ||
2279 | if (request->rq_cli_ctx) | |
2280 | sptlrpc_req_put_ctx(request, !locked); | |
2281 | ||
2282 | if (request->rq_pool) | |
2283 | __ptlrpc_free_req_to_pool(request); | |
2284 | else | |
35b2e1b7 | 2285 | ptlrpc_request_cache_free(request); |
d7e09d03 PT |
2286 | } |
2287 | ||
2288 | static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked); | |
2289 | /** | |
2290 | * Drop one request reference. Must be called with import imp_lock held. | |
930cef9a | 2291 | * When reference count drops to zero, request is freed. |
d7e09d03 PT |
2292 | */ |
2293 | void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request) | |
2294 | { | |
5e42bc9d | 2295 | assert_spin_locked(&request->rq_import->imp_lock); |
d7e09d03 PT |
2296 | (void)__ptlrpc_req_finished(request, 1); |
2297 | } | |
2298 | EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock); | |
2299 | ||
2300 | /** | |
2301 | * Helper function | |
2302 | * Drops one reference count for request \a request. | |
2303 | * \a locked set indicates that caller holds import imp_lock. | |
930cef9a | 2304 | * Frees the request when reference count reaches zero. |
d7e09d03 PT |
2305 | */ |
2306 | static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) | |
2307 | { | |
d7e09d03 | 2308 | if (request == NULL) |
0a3bdb00 | 2309 | return 1; |
d7e09d03 PT |
2310 | |
2311 | if (request == LP_POISON || | |
2312 | request->rq_reqmsg == LP_POISON) { | |
2313 | CERROR("dereferencing freed request (bug 575)\n"); | |
2314 | LBUG(); | |
0a3bdb00 | 2315 | return 1; |
d7e09d03 PT |
2316 | } |
2317 | ||
2318 | DEBUG_REQ(D_INFO, request, "refcount now %u", | |
2319 | atomic_read(&request->rq_refcount) - 1); | |
2320 | ||
2321 | if (atomic_dec_and_test(&request->rq_refcount)) { | |
2322 | __ptlrpc_free_req(request, locked); | |
0a3bdb00 | 2323 | return 1; |
d7e09d03 PT |
2324 | } |
2325 | ||
0a3bdb00 | 2326 | return 0; |
d7e09d03 PT |
2327 | } |
2328 | ||
2329 | /** | |
2330 | * Drops one reference count for a request. | |
2331 | */ | |
2332 | void ptlrpc_req_finished(struct ptlrpc_request *request) | |
2333 | { | |
2334 | __ptlrpc_req_finished(request, 0); | |
2335 | } | |
2336 | EXPORT_SYMBOL(ptlrpc_req_finished); | |
2337 | ||
2338 | /** | |
2339 | * Returns xid of a \a request | |
2340 | */ | |
2341 | __u64 ptlrpc_req_xid(struct ptlrpc_request *request) | |
2342 | { | |
2343 | return request->rq_xid; | |
2344 | } | |
2345 | EXPORT_SYMBOL(ptlrpc_req_xid); | |
2346 | ||
2347 | /** | |
2348 | * Disengage the client's reply buffer from the network | |
2349 | * NB does _NOT_ unregister any client-side bulk. | |
2350 | * IDEMPOTENT, but _not_ safe against concurrent callers. | |
2351 | * The request owner (i.e. the thread doing the I/O) must call... | |
2352 | * Returns 1 once the reply is unregistered, or 0 if an async unlink is still pending. | |
2353 | */ | |
2354 | int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) | |
2355 | { | |
d0bfef31 CH |
2356 | int rc; |
2357 | wait_queue_head_t *wq; | |
d7e09d03 PT |
2358 | struct l_wait_info lwi; |
2359 | ||
2360 | /* | |
2361 | * Might sleep. | |
2362 | */ | |
2363 | LASSERT(!in_interrupt()); | |
2364 | ||
2365 | /* | |
2366 | * Let's set up the deadline for reply unlink. | |
2367 | */ | |
2368 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && | |
2369 | async && request->rq_reply_deadline == 0) | |
7264b8a5 | 2370 | request->rq_reply_deadline = get_seconds()+LONG_UNLINK; |
d7e09d03 PT |
2371 | |
2372 | /* | |
2373 | * Nothing left to do. | |
2374 | */ | |
2375 | if (!ptlrpc_client_recv_or_unlink(request)) | |
0a3bdb00 | 2376 | return 1; |
d7e09d03 PT |
2377 | |
2378 | LNetMDUnlink(request->rq_reply_md_h); | |
2379 | ||
2380 | /* | |
2381 | * Let's check it once again. | |
2382 | */ | |
2383 | if (!ptlrpc_client_recv_or_unlink(request)) | |
0a3bdb00 | 2384 | return 1; |
d7e09d03 PT |
2385 | |
2386 | /* | |
2387 | * Move to "Unregistering" phase as reply was not unlinked yet. | |
2388 | */ | |
2389 | ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING); | |
2390 | ||
2391 | /* | |
2392 | * Do not wait for unlink to finish. | |
2393 | */ | |
2394 | if (async) | |
0a3bdb00 | 2395 | return 0; |
d7e09d03 PT |
2396 | |
2397 | /* | |
2398 | * We have to l_wait_event() whatever the result, to give liblustre | |
2399 | * a chance to run reply_in_callback(), and to make sure we've | |
2400 | * unlinked before returning a req to the pool. | |
2401 | */ | |
2402 | if (request->rq_set != NULL) | |
2403 | wq = &request->rq_set->set_waitq; | |
2404 | else | |
2405 | wq = &request->rq_reply_waitq; | |
2406 | ||
2407 | for (;;) { | |
2408 | /* Network access will complete in finite time but the HUGE | |
2409 | * timeout lets us CWARN for visibility of sluggish NALs */ | |
2410 | lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), | |
2411 | cfs_time_seconds(1), NULL, NULL); | |
2412 | rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request), | |
2413 | &lwi); | |
2414 | if (rc == 0) { | |
2415 | ptlrpc_rqphase_move(request, request->rq_next_phase); | |
0a3bdb00 | 2416 | return 1; |
d7e09d03 PT |
2417 | } |
2418 | ||
2419 | LASSERT(rc == -ETIMEDOUT); | |
cf378ff7 AL |
2420 | DEBUG_REQ(D_WARNING, request, |
2421 | "Unexpectedly long timeout rvcng=%d unlnk=%d/%d", | |
2422 | request->rq_receiving_reply, | |
2423 | request->rq_req_unlink, request->rq_reply_unlink); | |
d7e09d03 | 2424 | } |
0a3bdb00 | 2425 | return 0; |
d7e09d03 PT |
2426 | } |
2427 | EXPORT_SYMBOL(ptlrpc_unregister_reply); | |
2428 | ||
63d42578 HZ |
2429 | static void ptlrpc_free_request(struct ptlrpc_request *req) |
2430 | { | |
2431 | spin_lock(&req->rq_lock); | |
2432 | req->rq_replay = 0; | |
2433 | spin_unlock(&req->rq_lock); | |
2434 | ||
2435 | if (req->rq_commit_cb != NULL) | |
2436 | req->rq_commit_cb(req); | |
2437 | list_del_init(&req->rq_replay_list); | |
2438 | ||
2439 | __ptlrpc_req_finished(req, 1); | |
2440 | } | |
2441 | ||
2442 | /** | |
2443 | * the request is committed and dropped from the replay list of its import | |
2444 | */ | |
2445 | void ptlrpc_request_committed(struct ptlrpc_request *req, int force) | |
2446 | { | |
2447 | struct obd_import *imp = req->rq_import; | |
2448 | ||
2449 | spin_lock(&imp->imp_lock); | |
2450 | if (list_empty(&req->rq_replay_list)) { | |
2451 | spin_unlock(&imp->imp_lock); | |
2452 | return; | |
2453 | } | |
2454 | ||
2455 | if (force || req->rq_transno <= imp->imp_peer_committed_transno) | |
2456 | ptlrpc_free_request(req); | |
2457 | ||
2458 | spin_unlock(&imp->imp_lock); | |
2459 | } | |
2460 | EXPORT_SYMBOL(ptlrpc_request_committed); | |
2461 | ||
d7e09d03 PT |
2462 | /** |
2463 | * Iterates through replay_list on import and prunes | |
2464 | * all requests that have transno smaller than last_committed for the | |
2465 | * import and do not have rq_replay set. | |
930cef9a | 2466 | * Since requests are sorted in transno order, stops when meeting first |
d7e09d03 PT |
2467 | * transno bigger than last_committed. |
2468 | * caller must hold imp->imp_lock | |
2469 | */ | |
2470 | void ptlrpc_free_committed(struct obd_import *imp) | |
2471 | { | |
63d42578 | 2472 | struct ptlrpc_request *req, *saved; |
d7e09d03 | 2473 | struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ |
d0bfef31 | 2474 | bool skip_committed_list = true; |
d7e09d03 PT |
2475 | |
2476 | LASSERT(imp != NULL); | |
5e42bc9d | 2477 | assert_spin_locked(&imp->imp_lock); |
d7e09d03 PT |
2478 | |
2479 | if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked && | |
2480 | imp->imp_generation == imp->imp_last_generation_checked) { | |
b0f5aad5 | 2481 | CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n", |
d7e09d03 | 2482 | imp->imp_obd->obd_name, imp->imp_peer_committed_transno); |
d7e09d03 PT |
2483 | return; |
2484 | } | |
b0f5aad5 | 2485 | CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n", |
d7e09d03 PT |
2486 | imp->imp_obd->obd_name, imp->imp_peer_committed_transno, |
2487 | imp->imp_generation); | |
63d42578 HZ |
2488 | |
2489 | if (imp->imp_generation != imp->imp_last_generation_checked) | |
2490 | skip_committed_list = false; | |
2491 | ||
d7e09d03 PT |
2492 | imp->imp_last_transno_checked = imp->imp_peer_committed_transno; |
2493 | imp->imp_last_generation_checked = imp->imp_generation; | |
2494 | ||
63d42578 HZ |
2495 | list_for_each_entry_safe(req, saved, &imp->imp_replay_list, |
2496 | rq_replay_list) { | |
d7e09d03 PT |
2497 | /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ |
2498 | LASSERT(req != last_req); | |
2499 | last_req = req; | |
2500 | ||
2501 | if (req->rq_transno == 0) { | |
2502 | DEBUG_REQ(D_EMERG, req, "zero transno during replay"); | |
2503 | LBUG(); | |
2504 | } | |
2505 | if (req->rq_import_generation < imp->imp_generation) { | |
2506 | DEBUG_REQ(D_RPCTRACE, req, "free request with old gen"); | |
a9b3e8f3 | 2507 | goto free_req; |
d7e09d03 PT |
2508 | } |
2509 | ||
d7e09d03 PT |
2510 | /* not yet committed */ |
2511 | if (req->rq_transno > imp->imp_peer_committed_transno) { | |
2512 | DEBUG_REQ(D_RPCTRACE, req, "stopping search"); | |
2513 | break; | |
2514 | } | |
2515 | ||
63d42578 HZ |
2516 | if (req->rq_replay) { |
2517 | DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)"); | |
2518 | list_move_tail(&req->rq_replay_list, | |
2519 | &imp->imp_committed_list); | |
2520 | continue; | |
2521 | } | |
2522 | ||
b0f5aad5 | 2523 | DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)", |
d7e09d03 PT |
2524 | imp->imp_peer_committed_transno); |
2525 | free_req: | |
63d42578 HZ |
2526 | ptlrpc_free_request(req); |
2527 | } | |
2528 | if (skip_committed_list) | |
2529 | return; | |
2530 | ||
2531 | list_for_each_entry_safe(req, saved, &imp->imp_committed_list, | |
2532 | rq_replay_list) { | |
2533 | LASSERT(req->rq_transno != 0); | |
2534 | if (req->rq_import_generation < imp->imp_generation) { | |
2535 | DEBUG_REQ(D_RPCTRACE, req, "free stale open request"); | |
2536 | ptlrpc_free_request(req); | |
2537 | } | |
d7e09d03 | 2538 | } |
d7e09d03 PT |
2539 | } |
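/*
 * Net effect (informal): imp_replay_list holds uncommitted requests in
 * transno order; committed requests that keep rq_replay set migrate to
 * imp_committed_list and are pruned only when the import generation
 * changes.
 */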
2540 | ||
2541 | void ptlrpc_cleanup_client(struct obd_import *imp) | |
2542 | { | |
d7e09d03 PT |
2543 | } |
2544 | EXPORT_SYMBOL(ptlrpc_cleanup_client); | |
2545 | ||
2546 | /** | |
2547 | * Schedule previously sent request for resend. | |
2548 | * For bulk requests we assign new xid (to avoid problems with | |
2549 | * lost replies and therefore several transfers landing into same buffer | |
2550 | * from different sending attempts). | |
2551 | */ | |
2552 | void ptlrpc_resend_req(struct ptlrpc_request *req) | |
2553 | { | |
2554 | DEBUG_REQ(D_HA, req, "going to resend"); | |
5c689e68 AB |
2555 | spin_lock(&req->rq_lock); |
2556 | ||
2557 | /* The request got a reply but is still linked to the import list. | |
2558 | * Let ptlrpc_check_set() process it. */ | |
2559 | if (ptlrpc_client_replied(req)) { | |
2560 | spin_unlock(&req->rq_lock); | |
2561 | DEBUG_REQ(D_HA, req, "it has reply, so skip it"); | |
2562 | return; | |
2563 | } | |
2564 | ||
d7e09d03 PT |
2565 | lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 }); |
2566 | req->rq_status = -EAGAIN; | |
2567 | ||
d7e09d03 PT |
2568 | req->rq_resend = 1; |
2569 | req->rq_net_err = 0; | |
2570 | req->rq_timedout = 0; | |
2571 | if (req->rq_bulk) { | |
2572 | __u64 old_xid = req->rq_xid; | |
2573 | ||
2574 | /* ensure previous bulk fails */ | |
2575 | req->rq_xid = ptlrpc_next_xid(); | |
b0f5aad5 | 2576 | CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n", |
d7e09d03 PT |
2577 | old_xid, req->rq_xid); |
2578 | } | |
2579 | ptlrpc_client_wake_req(req); | |
2580 | spin_unlock(&req->rq_lock); | |
2581 | } | |
2582 | EXPORT_SYMBOL(ptlrpc_resend_req); | |
2583 | ||
2584 | /* XXX: this function and rq_status are currently unused */ | |
2585 | void ptlrpc_restart_req(struct ptlrpc_request *req) | |
2586 | { | |
2587 | DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request"); | |
2588 | req->rq_status = -ERESTARTSYS; | |
2589 | ||
2590 | spin_lock(&req->rq_lock); | |
2591 | req->rq_restart = 1; | |
2592 | req->rq_timedout = 0; | |
2593 | ptlrpc_client_wake_req(req); | |
2594 | spin_unlock(&req->rq_lock); | |
2595 | } | |
2596 | EXPORT_SYMBOL(ptlrpc_restart_req); | |
2597 | ||
2598 | /** | |
2599 | * Grab additional reference on a request \a req | |
2600 | */ | |
2601 | struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) | |
2602 | { | |
d7e09d03 | 2603 | atomic_inc(&req->rq_refcount); |
0a3bdb00 | 2604 | return req; |
d7e09d03 PT |
2605 | } |
2606 | EXPORT_SYMBOL(ptlrpc_request_addref); | |
2607 | ||
2608 | /** | |
2609 | * Add a request to import replay_list. | |
2610 | * Must be called under imp_lock | |
2611 | */ | |
2612 | void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, | |
2613 | struct obd_import *imp) | |
2614 | { | |
2615 | struct list_head *tmp; | |
2616 | ||
5e42bc9d | 2617 | assert_spin_locked(&imp->imp_lock); |
d7e09d03 PT |
2618 | |
2619 | if (req->rq_transno == 0) { | |
2620 | DEBUG_REQ(D_EMERG, req, "saving request with zero transno"); | |
2621 | LBUG(); | |
2622 | } | |
2623 | ||
2624 | /* clear this for new requests that were resent as well | |
2625 | as resent replayed requests. */ | |
2626 | lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); | |
2627 | ||
2628 | /* don't re-add requests that have been replayed */ | |
2629 | if (!list_empty(&req->rq_replay_list)) | |
2630 | return; | |
2631 | ||
2632 | lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); | |
2633 | ||
2634 | LASSERT(imp->imp_replayable); | |
2635 | /* Balanced in ptlrpc_free_committed, usually. */ | |
2636 | ptlrpc_request_addref(req); | |
2637 | list_for_each_prev(tmp, &imp->imp_replay_list) { | |
2638 | struct ptlrpc_request *iter = | |
2639 | list_entry(tmp, struct ptlrpc_request, | |
2640 | rq_replay_list); | |
2641 | ||
2642 | /* We may have duplicate transnos if we create and then | |
2643 | * open a file, or for closes retained to match creating | |
2644 | * opens, so use req->rq_xid as a secondary key. | |
2645 | * (See bugs 684, 685, and 428.) | |
2646 | * XXX no longer needed, but all opens need transnos! | |
2647 | */ | |
2648 | if (iter->rq_transno > req->rq_transno) | |
2649 | continue; | |
2650 | ||
2651 | if (iter->rq_transno == req->rq_transno) { | |
2652 | LASSERT(iter->rq_xid != req->rq_xid); | |
2653 | if (iter->rq_xid > req->rq_xid) | |
2654 | continue; | |
2655 | } | |
2656 | ||
2657 | list_add(&req->rq_replay_list, &iter->rq_replay_list); | |
2658 | return; | |
2659 | } | |
2660 | ||
2661 | list_add(&req->rq_replay_list, &imp->imp_replay_list); | |
2662 | } | |
2663 | EXPORT_SYMBOL(ptlrpc_retain_replayable_request); | |
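/*
 * Resulting invariant: imp_replay_list stays sorted by ascending
 * rq_transno, ties broken by ascending rq_xid, which is what lets
 * ptlrpc_free_committed() stop at the first uncommitted entry.
 */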
2664 | ||
2665 | /** | |
2666 | * Send request and wait until it completes. | |
2667 | * Returns request processing status. | |
2668 | */ | |
2669 | int ptlrpc_queue_wait(struct ptlrpc_request *req) | |
2670 | { | |
2671 | struct ptlrpc_request_set *set; | |
2672 | int rc; | |
d7e09d03 PT |
2673 | |
2674 | LASSERT(req->rq_set == NULL); | |
2675 | LASSERT(!req->rq_receiving_reply); | |
2676 | ||
2677 | set = ptlrpc_prep_set(); | |
2678 | if (set == NULL) { | |
2679 | CERROR("Unable to allocate ptlrpc set.\n"); | |
0a3bdb00 | 2680 | return -ENOMEM; |
d7e09d03 PT |
2681 | } |
2682 | ||
2683 | /* for distributed debugging */ | |
2684 | lustre_msg_set_status(req->rq_reqmsg, current_pid()); | |
2685 | ||
2686 | /* add a ref for the set (see comment in ptlrpc_set_add_req) */ | |
2687 | ptlrpc_request_addref(req); | |
2688 | ptlrpc_set_add_req(set, req); | |
2689 | rc = ptlrpc_set_wait(set); | |
2690 | ptlrpc_set_destroy(set); | |
2691 | ||
0a3bdb00 | 2692 | return rc; |
d7e09d03 PT |
2693 | } |
2694 | EXPORT_SYMBOL(ptlrpc_queue_wait); | |
2695 | ||
2696 | struct ptlrpc_replay_async_args { | |
2697 | int praa_old_state; | |
2698 | int praa_old_status; | |
2699 | }; | |
2700 | ||
2701 | /** | |
2702 | * Callback used for replayed requests reply processing. | |
930cef9a | 2703 | * In case of successful reply calls registered request replay callback. |
d7e09d03 PT |
2704 | * In case of error restart replay process. |
2705 | */ | |
2706 | static int ptlrpc_replay_interpret(const struct lu_env *env, | |
2707 | struct ptlrpc_request *req, | |
0028d585 | 2708 | void *data, int rc) |
d7e09d03 PT |
2709 | { |
2710 | struct ptlrpc_replay_async_args *aa = data; | |
2711 | struct obd_import *imp = req->rq_import; | |
2712 | ||
d7e09d03 PT |
2713 | atomic_dec(&imp->imp_replay_inflight); |
2714 | ||
2715 | if (!ptlrpc_client_replied(req)) { | |
2716 | CERROR("request replay timed out, restarting recovery\n"); | |
a9b3e8f3 JL |
2717 | rc = -ETIMEDOUT; |
2718 | goto out; | |
d7e09d03 PT |
2719 | } |
2720 | ||
2721 | if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR && | |
2722 | (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN || | |
a9b3e8f3 JL |
2723 | lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) { |
2724 | rc = lustre_msg_get_status(req->rq_repmsg); | |
2725 | goto out; | |
2726 | } | |
d7e09d03 PT |
2727 | |
2728 | /** VBR: check version failure */ | |
2729 | if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) { | |
2730 | /** replay was failed due to version mismatch */ | |
2731 | DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); | |
2732 | spin_lock(&imp->imp_lock); | |
2733 | imp->imp_vbr_failed = 1; | |
2734 | imp->imp_no_lock_replay = 1; | |
2735 | spin_unlock(&imp->imp_lock); | |
2736 | lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); | |
2737 | } else { | |
2738 | /** The transno had better not change over replay. */ | |
2739 | LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == | |
2740 | lustre_msg_get_transno(req->rq_repmsg) || | |
2741 | lustre_msg_get_transno(req->rq_repmsg) == 0, | |
55f5a824 | 2742 | "%#llx/%#llx\n", |
d7e09d03 PT |
2743 | lustre_msg_get_transno(req->rq_reqmsg), |
2744 | lustre_msg_get_transno(req->rq_repmsg)); | |
2745 | } | |
2746 | ||
2747 | spin_lock(&imp->imp_lock); | |
2748 | /** if replays by version then gap occur on server, no trust to locks */ | |
2749 | if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY) | |
2750 | imp->imp_no_lock_replay = 1; | |
2751 | imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); | |
2752 | spin_unlock(&imp->imp_lock); | |
2753 | LASSERT(imp->imp_last_replay_transno); | |
2754 | ||
2755 | /* transaction number shouldn't be bigger than the latest replayed */ | |
2756 | if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { | |
2757 | DEBUG_REQ(D_ERROR, req, | |
b0f5aad5 GKH |
2758 | "Reported transno %llu is bigger than the replayed one: %llu", |
2759 | req->rq_transno, | |
d7e09d03 | 2760 | lustre_msg_get_transno(req->rq_reqmsg)); |
a9b3e8f3 JL |
2761 | rc = -EINVAL; |
2762 | goto out; | |
d7e09d03 PT |
2763 | } |
2764 | ||
2765 | DEBUG_REQ(D_HA, req, "got rep"); | |
2766 | ||
2767 | /* let the callback do fixups, possibly including changes to the request itself */ | |
2768 | if (req->rq_replay_cb) | |
2769 | req->rq_replay_cb(req); | |
2770 | ||
2771 | if (ptlrpc_client_replied(req) && | |
2772 | lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { | |
2773 | DEBUG_REQ(D_ERROR, req, "status %d, old was %d", | |
2774 | lustre_msg_get_status(req->rq_repmsg), | |
2775 | aa->praa_old_status); | |
2776 | } else { | |
2777 | /* Put it back for re-replay. */ | |
2778 | lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); | |
2779 | } | |
2780 | ||
2781 | /* | |
2782 | * Errors while replay can set transno to 0, but | |
2783 | * imp_last_replay_transno shouldn't be set to 0 anyway | |
2784 | */ | |
2785 | if (req->rq_transno == 0) | |
2786 | CERROR("Transno is 0 during replay!\n"); | |
2787 | ||
2788 | /* continue with recovery */ | |
2789 | rc = ptlrpc_import_recovery_state_machine(imp); | |
2790 | out: | |
2791 | req->rq_send_state = aa->praa_old_state; | |
2792 | ||
2793 | if (rc != 0) | |
2794 | /* this replay failed, so restart recovery */ | |
2795 | ptlrpc_connect_import(imp); | |
2796 | ||
0a3bdb00 | 2797 | return rc; |
d7e09d03 PT |
2798 | } |
2799 | ||
2800 | /** | |
2801 | * Prepares and queues request for replay. | |
2802 | * Adds it to ptlrpcd queue for actual sending. | |
2803 | * Returns 0 on success. | |
2804 | */ | |
2805 | int ptlrpc_replay_req(struct ptlrpc_request *req) | |
2806 | { | |
2807 | struct ptlrpc_replay_async_args *aa; | |
d7e09d03 PT |
2808 | |
2809 | LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); | |
2810 | ||
3949015e | 2811 | LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); |
d7e09d03 | 2812 | aa = ptlrpc_req_async_args(req); |
ec83e611 | 2813 | memset(aa, 0, sizeof(*aa)); |
d7e09d03 PT |
2814 | |
2815 | /* Prepare request to be resent with ptlrpcd */ | |
2816 | aa->praa_old_state = req->rq_send_state; | |
2817 | req->rq_send_state = LUSTRE_IMP_REPLAY; | |
2818 | req->rq_phase = RQ_PHASE_NEW; | |
2819 | req->rq_next_phase = RQ_PHASE_UNDEFINED; | |
2820 | if (req->rq_repmsg) | |
2821 | aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); | |
2822 | req->rq_status = 0; | |
2823 | req->rq_interpret_reply = ptlrpc_replay_interpret; | |
2824 | /* Readjust the timeout for current conditions */ | |
2825 | ptlrpc_at_set_req_timeout(req); | |
2826 | ||
2827 | /* Tell server the net_latency, so the server can calculate how long | |
2828 | * it should wait for next replay */ | |
2829 | lustre_msg_set_service_time(req->rq_reqmsg, | |
2830 | ptlrpc_at_get_net_latency(req)); | |
2831 | DEBUG_REQ(D_HA, req, "REPLAY"); | |
2832 | ||
2833 | atomic_inc(&req->rq_import->imp_replay_inflight); | |
2834 | ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ | |
2835 | ||
2836 | ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); | |
0a3bdb00 | 2837 | return 0; |
d7e09d03 PT |
2838 | } |
2839 | EXPORT_SYMBOL(ptlrpc_replay_req); | |
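/*
 * Replay flow (informal): the request is rewound to RQ_PHASE_NEW with
 * rq_send_state = LUSTRE_IMP_REPLAY and handed to ptlrpcd;
 * ptlrpc_replay_interpret() later restores the saved send state and
 * either advances ptlrpc_import_recovery_state_machine() or, on failure,
 * restarts recovery via ptlrpc_connect_import().
 */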
2840 | ||
2841 | /** | |
2842 | * Aborts all in-flight request on import \a imp sending and delayed lists | |
2843 | */ | |
2844 | void ptlrpc_abort_inflight(struct obd_import *imp) | |
2845 | { | |
2846 | struct list_head *tmp, *n; | |
d7e09d03 PT |
2847 | |
2848 | /* Make sure that no new requests get processed for this import. | |
2849 | * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing | |
2850 | * this flag and then putting requests on sending_list or delayed_list. | |
2851 | */ | |
2852 | spin_lock(&imp->imp_lock); | |
2853 | ||
2854 | /* XXX locking? Maybe we should remove each request with the list | |
2855 | * locked? Also, how do we know if the requests on the list are | |
2856 | * being freed at this time? | |
2857 | */ | |
2858 | list_for_each_safe(tmp, n, &imp->imp_sending_list) { | |
2859 | struct ptlrpc_request *req = | |
2860 | list_entry(tmp, struct ptlrpc_request, rq_list); | |
2861 | ||
2862 | DEBUG_REQ(D_RPCTRACE, req, "inflight"); | |
2863 | ||
2864 | spin_lock(&req->rq_lock); | |
2865 | if (req->rq_import_generation < imp->imp_generation) { | |
2866 | req->rq_err = 1; | |
2867 | req->rq_status = -EIO; | |
2868 | ptlrpc_client_wake_req(req); | |
2869 | } | |
2870 | spin_unlock(&req->rq_lock); | |
2871 | } | |
2872 | ||
2873 | list_for_each_safe(tmp, n, &imp->imp_delayed_list) { | |
2874 | struct ptlrpc_request *req = | |
2875 | list_entry(tmp, struct ptlrpc_request, rq_list); | |
2876 | ||
2877 | DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req"); | |
2878 | ||
2879 | spin_lock(&req->rq_lock); | |
2880 | if (req->rq_import_generation < imp->imp_generation) { | |
2881 | req->rq_err = 1; | |
2882 | req->rq_status = -EIO; | |
2883 | ptlrpc_client_wake_req(req); | |
2884 | } | |
2885 | spin_unlock(&req->rq_lock); | |
2886 | } | |
2887 | ||
2888 | /* Last chance to free reqs left on the replay list, but we | |
2889 | * will still leak reqs that haven't committed. */ | |
2890 | if (imp->imp_replayable) | |
2891 | ptlrpc_free_committed(imp); | |
2892 | ||
2893 | spin_unlock(&imp->imp_lock); | |
d7e09d03 PT |
2894 | } |
2895 | EXPORT_SYMBOL(ptlrpc_abort_inflight); | |
2896 | ||
2897 | /** | |
2898 | * Abort all uncompleted requests in request set \a set | |
2899 | */ | |
2900 | void ptlrpc_abort_set(struct ptlrpc_request_set *set) | |
2901 | { | |
2902 | struct list_head *tmp, *pos; | |
2903 | ||
2904 | LASSERT(set != NULL); | |
2905 | ||
2906 | list_for_each_safe(pos, tmp, &set->set_requests) { | |
2907 | struct ptlrpc_request *req = | |
2908 | list_entry(pos, struct ptlrpc_request, | |
2909 | rq_set_chain); | |
2910 | ||
2911 | spin_lock(&req->rq_lock); | |
2912 | if (req->rq_phase != RQ_PHASE_RPC) { | |
2913 | spin_unlock(&req->rq_lock); | |
2914 | continue; | |
2915 | } | |
2916 | ||
2917 | req->rq_err = 1; | |
2918 | req->rq_status = -EINTR; | |
2919 | ptlrpc_client_wake_req(req); | |
2920 | spin_unlock(&req->rq_lock); | |
2921 | } | |
2922 | } | |
2923 | ||
2924 | static __u64 ptlrpc_last_xid; | |
2925 | static spinlock_t ptlrpc_last_xid_lock; | |
2926 | ||
2927 | /** | |
2928 | * Initialize the XID for the node. This is common among all requests on | |
2929 | * this node, and only requires the property that it is monotonically | |
2930 | * increasing. It does not need to be sequential. Since this is also used | |
2931 | * as the RDMA match bits, it is important that a single client NOT have | |
2932 | * the same match bits for two different in-flight requests, hence we do | |
2933 | * NOT want to have an XID per target or similar. | |
2934 | * | |
2935 | * To avoid an unlikely collision between match bits after a client reboot | |
2936 | * (which would deliver old data into the wrong RDMA buffer) initialize | |
2937 | * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s. | |
2938 | * If the time is clearly incorrect, we instead use a 62-bit random number. | |
2939 | * In the worst case the random number will overflow 1M RPCs per second in | |
2940 | * 9133 years, or permutations thereof. | |
2941 | */ | |
2942 | #define YEAR_2004 (1ULL << 30) | |
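/* 1ULL << 30 seconds is ~1.07e9 s past the Unix epoch, i.e. early 2004;
 * clock readings below this threshold are treated as obviously wrong. */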
2943 | void ptlrpc_init_xid(void) | |
2944 | { | |
7264b8a5 | 2945 | time_t now = get_seconds(); |
d7e09d03 PT |
2946 | |
2947 | spin_lock_init(&ptlrpc_last_xid_lock); | |
2948 | if (now < YEAR_2004) { | |
2949 | cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid)); | |
2950 | ptlrpc_last_xid >>= 2; | |
2951 | ptlrpc_last_xid |= (1ULL << 61); | |
2952 | } else { | |
2953 | ptlrpc_last_xid = (__u64)now << 20; | |
2954 | } | |
2955 | ||
930cef9a | 2956 | /* Always need to be aligned to a power-of-two for multi-bulk BRW */ |
d7e09d03 PT |
2957 | CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0); |
2958 | ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK; | |
2959 | } | |
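
/*
 * Editor's worked example for the arithmetic above (illustrative, not
 * part of the original source): get_seconds() counts wall-clock seconds,
 * so two boots one second apart start at time-based XIDs that differ by
 * 1 << 20 (about 1M). A client would therefore have to consume more than
 * 1M XIDs per second of uptime before a post-reboot XID could collide
 * with the match bits of a request still pending from before the reboot.
 * The random fallback sets bit 61, keeping it far above any plausible
 * time-based value ((__u64)now << 20 stays below 2^52 until roughly the
 * year 2106, when a 32-bit seconds count would wrap).
 */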

/**
 * Increase the xid and return the resulting new value to the caller.
 *
 * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting
 * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC
 * itself uses the last bulk xid needed, so the server can determine the
 * number of bulk transfers from the RPC XID and a bitmask. The starting
 * xid must align to a power-of-two value.
 *
 * This alignment holds because the initial ptlrpc_last_xid value is
 * itself aligned to a power-of-two boundary. See LU-1431.
 */
__u64 ptlrpc_next_xid(void)
{
	__u64 next;

	spin_lock(&ptlrpc_last_xid_lock);
	next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
	ptlrpc_last_xid = next;
	spin_unlock(&ptlrpc_last_xid_lock);

	return next;
}
EXPORT_SYMBOL(ptlrpc_next_xid);
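
/*
 * Editor's sketch (hypothetical server-side fragment, not part of the
 * original source): because each RPC reserves an aligned block of
 * PTLRPC_BULK_OPS_COUNT xids and carries the *last* bulk xid it needs,
 * the peer can recover both the first xid and the transfer count from
 * the RPC XID alone, assuming the mask is defined as
 * PTLRPC_BULK_OPS_MASK == ~(__u64)(PTLRPC_BULK_OPS_COUNT - 1):
 *
 *	__u64 first_xid = rq_xid & PTLRPC_BULK_OPS_MASK;
 *	int nbulk = (int)(rq_xid & ~PTLRPC_BULK_OPS_MASK) + 1;
 */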

/**
 * Get a glimpse at what the next xid value might be.
 * Returns a possible next xid; the value is racy and only advisory.
 */
__u64 ptlrpc_sample_next_xid(void)
{
#if BITS_PER_LONG == 32
	/* need to avoid possible word tearing on 32-bit systems */
	__u64 next;

	spin_lock(&ptlrpc_last_xid_lock);
	next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
	spin_unlock(&ptlrpc_last_xid_lock);

	return next;
#else
	/* No need to lock, since the returned value is racy anyway */
	return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
#endif
}
EXPORT_SYMBOL(ptlrpc_sample_next_xid);

/**
 * Functions for operating ptlrpc workers.
 *
 * A ptlrpc work is a function which will be run inside a ptlrpc context.
 * The callback shouldn't sleep, otherwise it will block that ptlrpcd thread.
 *
 * 1. after a work is created, it can be used many times, that is:
 *         handler = ptlrpcd_alloc_work();
 *         ptlrpcd_queue_work();
 *
 *    queue it again when necessary:
 *         ptlrpcd_queue_work();
 *         ptlrpcd_destroy_work();
 * 2. ptlrpcd_queue_work() can be called by multiple processes concurrently,
 *    but the work will only be queued once at any time. Also, as its name
 *    implies, there may be a delay before it is actually run by a ptlrpcd
 *    thread.
 */
struct ptlrpc_work_async_args {
	int (*cb)(const struct lu_env *, void *);
	void *cbdata;
};

static void ptlrpcd_add_work_req(struct ptlrpc_request *req)
{
	/* re-initialize the req */
	req->rq_timeout = obd_timeout;
	req->rq_sent = get_seconds();
	req->rq_deadline = req->rq_sent + req->rq_timeout;
	req->rq_reply_deadline = req->rq_deadline;
	req->rq_phase = RQ_PHASE_INTERPRET;
	req->rq_next_phase = RQ_PHASE_COMPLETE;
	req->rq_xid = ptlrpc_next_xid();
	req->rq_import_generation = req->rq_import->imp_generation;

	ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
}

static int work_interpreter(const struct lu_env *env,
			    struct ptlrpc_request *req, void *data, int rc)
{
	struct ptlrpc_work_async_args *arg = data;

	LASSERT(ptlrpcd_check_work(req));
	LASSERT(arg->cb != NULL);

	rc = arg->cb(env, arg->cbdata);

	list_del_init(&req->rq_set_chain);
	req->rq_set = NULL;

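	/*
	 * A refcount above 1 here means ptlrpcd_queue_work() was called
	 * again while the callback was running: reset the count to the
	 * "queued" value of 2 and resubmit the request for another run.
	 */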
	if (atomic_dec_return(&req->rq_refcount) > 1) {
		atomic_set(&req->rq_refcount, 2);
		ptlrpcd_add_work_req(req);
	}
	return rc;
}

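/*
 * Editor's note (not in the original source): worker_format is never
 * used as a real request format; only its address matters, as a sentinel
 * stored in rq_pill.rc_fmt so that work requests can be told apart from
 * ordinary RPCs.
 */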
static int worker_format;

static int ptlrpcd_check_work(struct ptlrpc_request *req)
{
	return req->rq_pill.rc_fmt == (void *)&worker_format;
}

/**
 * Create a work item for ptlrpc.
 */
void *ptlrpcd_alloc_work(struct obd_import *imp,
			 int (*cb)(const struct lu_env *, void *), void *cbdata)
{
	struct ptlrpc_request *req = NULL;
	struct ptlrpc_work_async_args *args;

	might_sleep();

	if (cb == NULL)
		return ERR_PTR(-EINVAL);

	/* copy some code from the deprecated fakereq support */
	req = ptlrpc_request_cache_alloc(GFP_NOFS);
	if (req == NULL) {
		CERROR("ptlrpc: out of memory!\n");
		return ERR_PTR(-ENOMEM);
	}

	req->rq_send_state = LUSTRE_IMP_FULL;
	req->rq_type = PTL_RPC_MSG_REQUEST;
	req->rq_import = class_import_get(imp);
	req->rq_export = NULL;
	req->rq_interpret_reply = work_interpreter;
	/* don't want reply */
	req->rq_receiving_reply = 0;
	req->rq_req_unlink = req->rq_reply_unlink = 0;
	req->rq_no_delay = req->rq_no_resend = 1;
	req->rq_pill.rc_fmt = (void *)&worker_format;

	spin_lock_init(&req->rq_lock);
	INIT_LIST_HEAD(&req->rq_list);
	INIT_LIST_HEAD(&req->rq_replay_list);
	INIT_LIST_HEAD(&req->rq_set_chain);
	INIT_LIST_HEAD(&req->rq_history_list);
	INIT_LIST_HEAD(&req->rq_exp_list);
	init_waitqueue_head(&req->rq_reply_waitq);
	init_waitqueue_head(&req->rq_set_waitq);
	atomic_set(&req->rq_refcount, 1);

	CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args));
	args = ptlrpc_req_async_args(req);
	args->cb = cb;
	args->cbdata = cbdata;

	return req;
}
EXPORT_SYMBOL(ptlrpcd_alloc_work);

void ptlrpcd_destroy_work(void *handler)
{
	struct ptlrpc_request *req = handler;

	if (req)
		ptlrpc_req_finished(req);
}
EXPORT_SYMBOL(ptlrpcd_destroy_work);

int ptlrpcd_queue_work(void *handler)
{
	struct ptlrpc_request *req = handler;

	/*
	 * Check if the req is already queued.
	 *
	 * Here comes a trick: ptlrpc lacks a reliable way of checking
	 * whether a req is being processed, so the refcount of the req is
	 * used for that purpose instead. This is okay because the caller
	 * should treat this req as opaque data. - Jinshan
	 */
	LASSERT(atomic_read(&req->rq_refcount) > 0);
	if (atomic_inc_return(&req->rq_refcount) == 2)
		ptlrpcd_add_work_req(req);
	return 0;
}
EXPORT_SYMBOL(ptlrpcd_queue_work);
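
/*
 * Editor's usage sketch for the worker API above (hypothetical caller;
 * my_flush_cb, do_quick_flush and my_data are illustrative names, not
 * part of the original source):
 *
 *	static int my_flush_cb(const struct lu_env *env, void *data)
 *	{
 *		do_quick_flush(data);	(must not sleep: runs in ptlrpcd)
 *		return 0;
 *	}
 *
 *	handler = ptlrpcd_alloc_work(imp, my_flush_cb, my_data);
 *	if (IS_ERR(handler))
 *		return PTR_ERR(handler);
 *	ptlrpcd_queue_work(handler);	(runs my_flush_cb asynchronously)
 *	...
 *	ptlrpcd_queue_work(handler);	(may be queued again later)
 *	ptlrpcd_destroy_work(handler);	(drops the final reference)
 */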