Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
6a5b99a4 | 18 | * http://www.gnu.org/licenses/gpl-2.0.html |
d7e09d03 | 19 | * |
d7e09d03 PT |
20 | * GPL HEADER END |
21 | */ | |
22 | /* | |
23 | * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. | |
24 | * Use is subject to license terms. | |
25 | * | |
1dc563a6 | 26 | * Copyright (c) 2011, 2015, Intel Corporation. |
d7e09d03 PT |
27 | */ |
28 | /* | |
29 | * This file is part of Lustre, http://www.lustre.org/ | |
30 | * Lustre is a trademark of Sun Microsystems, Inc. | |
31 | * | |
32 | * lustre/ptlrpc/import.c | |
33 | * | |
34 | * Author: Mike Shaver <shaver@clusterfs.com> | |
35 | */ | |
36 | ||
37 | #define DEBUG_SUBSYSTEM S_RPC | |
38 | ||
e27db149 GKH |
39 | #include "../include/obd_support.h" |
40 | #include "../include/lustre_ha.h" | |
41 | #include "../include/lustre_net.h" | |
42 | #include "../include/lustre_import.h" | |
43 | #include "../include/lustre_export.h" | |
44 | #include "../include/obd.h" | |
45 | #include "../include/obd_cksum.h" | |
46 | #include "../include/obd_class.h" | |
d7e09d03 PT |
47 | |
48 | #include "ptlrpc_internal.h" | |
49 | ||
/* Arguments stashed in the connect RPC's rq_async_args and consumed by
 * ptlrpc_connect_interpret() when the CONNECT reply arrives.
 */
struct ptlrpc_connect_async_args {
	/* peer's last committed transno captured before (re)connecting */
	__u64 pcaa_peer_committed;
	/* non-zero when this is the very first connect for the import */
	int pcaa_initial_connect;
};
54 | ||
55 | /** | |
56 | * Updates import \a imp current state to provided \a state value | |
57 | * Helper function. Must be called under imp_lock. | |
58 | */ | |
59 | static void __import_set_state(struct obd_import *imp, | |
60 | enum lustre_imp_state state) | |
61 | { | |
502cb58e AS |
62 | switch (state) { |
63 | case LUSTRE_IMP_CLOSED: | |
64 | case LUSTRE_IMP_NEW: | |
65 | case LUSTRE_IMP_DISCON: | |
66 | case LUSTRE_IMP_CONNECTING: | |
67 | break; | |
68 | case LUSTRE_IMP_REPLAY_WAIT: | |
69 | imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS; | |
70 | break; | |
71 | default: | |
72 | imp->imp_replay_state = LUSTRE_IMP_REPLAY; | |
73 | } | |
74 | ||
d7e09d03 PT |
75 | imp->imp_state = state; |
76 | imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state; | |
77 | imp->imp_state_hist[imp->imp_state_hist_idx].ish_time = | |
74e489aa | 78 | ktime_get_real_seconds(); |
d7e09d03 PT |
79 | imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) % |
80 | IMP_STATE_HIST_LEN; | |
81 | } | |
82 | ||
/* A CLOSED import should remain so. */
/* Transition the import to \a state with a debug trace, unless it has
 * already been closed.  Caller must hold imp_lock.
 */
#define IMPORT_SET_STATE_NOLOCK(imp, state)				       \
do {									       \
	if (imp->imp_state != LUSTRE_IMP_CLOSED) {			       \
		CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n",  \
		       imp, obd2cli_tgt(imp->imp_obd),			       \
		       ptlrpc_import_state_name(imp->imp_state),	       \
		       ptlrpc_import_state_name(state));		       \
		__import_set_state(imp, state);				       \
	}								       \
} while (0)

/* Locking wrapper around IMPORT_SET_STATE_NOLOCK() for callers that do
 * not already hold imp_lock.
 */
#define IMPORT_SET_STATE(imp, state)					\
do {									\
	spin_lock(&imp->imp_lock);					\
	IMPORT_SET_STATE_NOLOCK(imp, state);				\
	spin_unlock(&imp->imp_lock);					\
} while (0)
d7e09d03 | 101 | |
d7e09d03 PT |
102 | static int ptlrpc_connect_interpret(const struct lu_env *env, |
103 | struct ptlrpc_request *request, | |
aff9d8e8 | 104 | void *data, int rc); |
d7e09d03 PT |
105 | int ptlrpc_import_recovery_state_machine(struct obd_import *imp); |
106 | ||
107 | /* Only this function is allowed to change the import state when it is | |
108 | * CLOSED. I would rather refcount the import and free it after | |
109 | * disconnection like we do with exports. To do that, the client_obd | |
110 | * will need to save the peer info somewhere other than in the import, | |
dadfcdab OD |
111 | * though. |
112 | */ | |
d7e09d03 PT |
113 | int ptlrpc_init_import(struct obd_import *imp) |
114 | { | |
115 | spin_lock(&imp->imp_lock); | |
116 | ||
117 | imp->imp_generation++; | |
d0bfef31 | 118 | imp->imp_state = LUSTRE_IMP_NEW; |
d7e09d03 PT |
119 | |
120 | spin_unlock(&imp->imp_lock); | |
121 | ||
122 | return 0; | |
123 | } | |
124 | EXPORT_SYMBOL(ptlrpc_init_import); | |
125 | ||
#define UUID_STR "_UUID"

/* Locate the "interesting" portion of \a uuid by skipping an optional
 * leading \a prefix and discounting a trailing "_UUID" suffix.  The
 * result is returned via \a uuid_start / \a uuid_len; the input string
 * itself is never modified.
 */
static void deuuidify(char *uuid, const char *prefix, char **uuid_start,
		      int *uuid_len)
{
	size_t tail = strlen(UUID_STR);

	/* step over the prefix when it matches the front of the uuid */
	if (prefix && strncmp(uuid, prefix, strlen(prefix)) == 0)
		*uuid_start = uuid + strlen(prefix);
	else
		*uuid_start = uuid;

	*uuid_len = strlen(*uuid_start);

	/* drop the "_UUID" suffix if it is present */
	if (*uuid_len >= (int)tail &&
	    strncmp(*uuid_start + *uuid_len - tail, UUID_STR, tail) == 0)
		*uuid_len -= tail;
}
d7e09d03 PT |
142 | |
143 | /** | |
144 | * Returns true if import was FULL, false if import was already not | |
145 | * connected. | |
146 | * @imp - import to be disconnected | |
147 | * @conn_cnt - connection count (epoch) of the request that timed out | |
148 | * and caused the disconnection. In some cases, multiple | |
149 | * inflight requests can fail to a single target (e.g. OST | |
150 | * bulk requests) and if one has already caused a reconnection | |
151 | * (increasing the import->conn_cnt) the older failure should | |
152 | * not also cause a reconnection. If zero it forces a reconnect. | |
153 | */ | |
154 | int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) | |
155 | { | |
156 | int rc = 0; | |
157 | ||
158 | spin_lock(&imp->imp_lock); | |
159 | ||
160 | if (imp->imp_state == LUSTRE_IMP_FULL && | |
161 | (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { | |
162 | char *target_start; | |
163 | int target_len; | |
164 | ||
165 | deuuidify(obd2cli_tgt(imp->imp_obd), NULL, | |
166 | &target_start, &target_len); | |
167 | ||
168 | if (imp->imp_replayable) { | |
2d00bd17 JP |
169 | LCONSOLE_WARN("%s: Connection to %.*s (at %s) was lost; in progress operations using this service will wait for recovery to complete\n", |
170 | imp->imp_obd->obd_name, target_len, target_start, | |
171 | libcfs_nid2str(imp->imp_connection->c_peer.nid)); | |
d7e09d03 | 172 | } else { |
2d00bd17 JP |
173 | LCONSOLE_ERROR_MSG(0x166, "%s: Connection to %.*s (at %s) was lost; in progress operations using this service will fail\n", |
174 | imp->imp_obd->obd_name, | |
175 | target_len, target_start, | |
176 | libcfs_nid2str(imp->imp_connection->c_peer.nid)); | |
d7e09d03 | 177 | } |
d7e09d03 PT |
178 | IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); |
179 | spin_unlock(&imp->imp_lock); | |
180 | ||
181 | if (obd_dump_on_timeout) | |
182 | libcfs_debug_dumplog(); | |
183 | ||
184 | obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); | |
185 | rc = 1; | |
186 | } else { | |
187 | spin_unlock(&imp->imp_lock); | |
188 | CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n", | |
189 | imp->imp_client->cli_name, imp, | |
190 | (imp->imp_state == LUSTRE_IMP_FULL && | |
191 | imp->imp_conn_cnt > conn_cnt) ? | |
192 | "reconnected" : "not connected", imp->imp_conn_cnt, | |
193 | conn_cnt, ptlrpc_import_state_name(imp->imp_state)); | |
194 | } | |
195 | ||
196 | return rc; | |
197 | } | |
198 | ||
441fda84 ML |
199 | /* |
200 | * This acts as a barrier; all existing requests are rejected, and | |
201 | * no new requests will be accepted until the import is valid again. | |
202 | */ | |
203 | void ptlrpc_deactivate_import(struct obd_import *imp) | |
d7e09d03 | 204 | { |
d7e09d03 | 205 | CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); |
441fda84 ML |
206 | |
207 | spin_lock(&imp->imp_lock); | |
d7e09d03 PT |
208 | imp->imp_invalid = 1; |
209 | imp->imp_generation++; | |
210 | spin_unlock(&imp->imp_lock); | |
211 | ||
212 | ptlrpc_abort_inflight(imp); | |
213 | obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); | |
d7e09d03 | 214 | } |
d7e09d03 PT |
215 | EXPORT_SYMBOL(ptlrpc_deactivate_import); |
216 | ||
217 | static unsigned int | |
219e6de6 | 218 | ptlrpc_inflight_deadline(struct ptlrpc_request *req, time64_t now) |
d7e09d03 PT |
219 | { |
220 | long dl; | |
221 | ||
222 | if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || | |
223 | (req->rq_phase == RQ_PHASE_BULK) || | |
224 | (req->rq_phase == RQ_PHASE_NEW))) | |
225 | return 0; | |
226 | ||
227 | if (req->rq_timedout) | |
228 | return 0; | |
229 | ||
230 | if (req->rq_phase == RQ_PHASE_NEW) | |
231 | dl = req->rq_sent; | |
232 | else | |
233 | dl = req->rq_deadline; | |
234 | ||
235 | if (dl <= now) | |
236 | return 0; | |
237 | ||
238 | return dl - now; | |
239 | } | |
240 | ||
241 | static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp) | |
242 | { | |
219e6de6 | 243 | time64_t now = ktime_get_real_seconds(); |
d7e09d03 PT |
244 | struct list_head *tmp, *n; |
245 | struct ptlrpc_request *req; | |
246 | unsigned int timeout = 0; | |
247 | ||
248 | spin_lock(&imp->imp_lock); | |
249 | list_for_each_safe(tmp, n, &imp->imp_sending_list) { | |
250 | req = list_entry(tmp, struct ptlrpc_request, rq_list); | |
251 | timeout = max(ptlrpc_inflight_deadline(req, now), timeout); | |
252 | } | |
253 | spin_unlock(&imp->imp_lock); | |
254 | return timeout; | |
255 | } | |
256 | ||
/**
 * This function will invalidate the import, if necessary, then block
 * for all the RPC completions, and finally notify the obd to
 * invalidate its state (ie cancel locks, clear pending requests,
 * etc).
 */
void ptlrpc_invalidate_import(struct obd_import *imp)
{
	struct list_head *tmp, *n;
	struct ptlrpc_request *req;
	struct l_wait_info lwi;
	unsigned int timeout;
	int rc;

	/* imp_inval_count lets other paths see an invalidation in flight */
	atomic_inc(&imp->imp_inval_count);

	if (!imp->imp_invalid || imp->imp_obd->obd_no_recov)
		ptlrpc_deactivate_import(imp);

	CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2);
	LASSERT(imp->imp_invalid);

	/* Wait forever until inflight == 0. We really can't do it another
	 * way because in some cases we need to wait for very long reply
	 * unlink. We can't do anything before that because there is really
	 * no guarantee that some rdma transfer is not in progress right now.
	 */
	do {
		/* Calculate max timeout for waiting on rpcs to error
		 * out. Use obd_timeout if calculated value is smaller
		 * than it.
		 */
		if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
			/* +1/3 slack over the longest inflight deadline */
			timeout = ptlrpc_inflight_timeout(imp);
			timeout += timeout / 3;

			if (timeout == 0)
				timeout = obd_timeout;
		} else {
			/* decrease the interval to increase race condition */
			timeout = 1;
		}

		CDEBUG(D_RPCTRACE,
		       "Sleeping %d sec for inflight to error out\n",
		       timeout);

		/* Wait for all requests to error out and call completion
		 * callbacks. Cap it at obd_timeout -- these should all
		 * have been locally cancelled by ptlrpc_abort_inflight.
		 */
		lwi = LWI_TIMEOUT_INTERVAL(
			cfs_timeout_cap(cfs_time_seconds(timeout)),
			(timeout > 1) ? cfs_time_seconds(1) :
			cfs_time_seconds(1) / 2,
			NULL, NULL);
		rc = l_wait_event(imp->imp_recovery_waitq,
				  (atomic_read(&imp->imp_inflight) == 0),
				  &lwi);
		if (rc) {
			/* timed out: diagnose what is still outstanding */
			const char *cli_tgt = obd2cli_tgt(imp->imp_obd);

			CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
			       cli_tgt, rc,
			       atomic_read(&imp->imp_inflight));

			spin_lock(&imp->imp_lock);
			if (atomic_read(&imp->imp_inflight) == 0) {
				int count = atomic_read(&imp->imp_unregistering);

				/* We know that "unregistering" rpcs only can
				 * survive in sending or delaying lists (they
				 * maybe waiting for long reply unlink in
				 * sluggish nets). Let's check this. If there
				 * is no inflight and unregistering != 0, this
				 * is bug.
				 */
				LASSERTF(count == 0, "Some RPCs are still unregistering: %d\n",
					 count);

				/* Let's save one loop as soon as inflight have
				 * dropped to zero. No new inflights possible at
				 * this point.
				 */
				rc = 0;
			} else {
				/* dump whatever is still queued for debugging */
				list_for_each_safe(tmp, n,
						   &imp->imp_sending_list) {
					req = list_entry(tmp,
							 struct ptlrpc_request,
							 rq_list);
					DEBUG_REQ(D_ERROR, req,
						  "still on sending list");
				}
				list_for_each_safe(tmp, n,
						   &imp->imp_delayed_list) {
					req = list_entry(tmp,
							 struct ptlrpc_request,
							 rq_list);
					DEBUG_REQ(D_ERROR, req,
						  "still on delayed list");
				}

				CERROR("%s: Unregistering RPCs found (%d). Network is sluggish? Waiting them to error out.\n",
				       cli_tgt,
				       atomic_read(&imp->imp_unregistering));
			}
			spin_unlock(&imp->imp_lock);
		}
	} while (rc != 0);

	/*
	 * Let's additionally check that no new rpcs added to import in
	 * "invalidate" state.
	 */
	LASSERT(atomic_read(&imp->imp_inflight) == 0);
	obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
	sptlrpc_import_flush_all_ctx(imp);

	atomic_dec(&imp->imp_inval_count);
	wake_up_all(&imp->imp_recovery_waitq);
}
EXPORT_SYMBOL(ptlrpc_invalidate_import);
381 | ||
382 | /* unset imp_invalid */ | |
383 | void ptlrpc_activate_import(struct obd_import *imp) | |
384 | { | |
385 | struct obd_device *obd = imp->imp_obd; | |
386 | ||
387 | spin_lock(&imp->imp_lock); | |
0b291b9a HZ |
388 | if (imp->imp_deactive != 0) { |
389 | spin_unlock(&imp->imp_lock); | |
390 | return; | |
391 | } | |
392 | ||
d7e09d03 | 393 | imp->imp_invalid = 0; |
d7e09d03 PT |
394 | spin_unlock(&imp->imp_lock); |
395 | obd_import_event(obd, imp, IMP_EVENT_ACTIVE); | |
396 | } | |
397 | EXPORT_SYMBOL(ptlrpc_activate_import); | |
398 | ||
cca8fca1 AS |
/* Ask the pinger thread to re-verify this import as soon as possible:
 * set imp_force_verify under imp_lock, then wake the pinger unless a
 * connect attempt is already in progress.
 */
static void ptlrpc_pinger_force(struct obd_import *imp)
{
	CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd),
	       ptlrpc_import_state_name(imp->imp_state));

	spin_lock(&imp->imp_lock);
	imp->imp_force_verify = 1;
	spin_unlock(&imp->imp_lock);

	/* NOTE(review): imp_state is read here without imp_lock, so this
	 * check can race with a state change — appears tolerated, confirm.
	 */
	if (imp->imp_state != LUSTRE_IMP_CONNECTING)
		ptlrpc_pinger_wake_up();
}
411 | ||
d7e09d03 PT |
412 | void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) |
413 | { | |
d7e09d03 PT |
414 | LASSERT(!imp->imp_dlm_fake); |
415 | ||
416 | if (ptlrpc_set_import_discon(imp, conn_cnt)) { | |
417 | if (!imp->imp_replayable) { | |
2d00bd17 | 418 | CDEBUG(D_HA, "import %s@%s for %s not replayable, auto-deactivating\n", |
d7e09d03 PT |
419 | obd2cli_tgt(imp->imp_obd), |
420 | imp->imp_connection->c_remote_uuid.uuid, | |
421 | imp->imp_obd->obd_name); | |
422 | ptlrpc_deactivate_import(imp); | |
423 | } | |
424 | ||
cca8fca1 | 425 | ptlrpc_pinger_force(imp); |
d7e09d03 | 426 | } |
d7e09d03 PT |
427 | } |
428 | EXPORT_SYMBOL(ptlrpc_fail_import); | |
429 | ||
430 | int ptlrpc_reconnect_import(struct obd_import *imp) | |
431 | { | |
cca8fca1 AS |
432 | struct l_wait_info lwi; |
433 | int secs = cfs_time_seconds(obd_timeout); | |
434 | int rc; | |
435 | ||
436 | ptlrpc_pinger_force(imp); | |
437 | ||
438 | CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", | |
439 | obd2cli_tgt(imp->imp_obd), secs); | |
440 | ||
441 | lwi = LWI_TIMEOUT(secs, NULL, NULL); | |
442 | rc = l_wait_event(imp->imp_recovery_waitq, | |
443 | !ptlrpc_import_in_recovery(imp), &lwi); | |
444 | CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd), | |
445 | ptlrpc_import_state_name(imp->imp_state)); | |
446 | return rc; | |
d7e09d03 PT |
447 | } |
448 | EXPORT_SYMBOL(ptlrpc_reconnect_import); | |
449 | ||
/**
 * Connection on import \a imp is changed to another one (if more than one is
 * present). We typically chose connection that we have not tried to connect to
 * the longest
 *
 * Returns 0 on success, -EINVAL if the import has no connections at all.
 */
static int import_select_connection(struct obd_import *imp)
{
	struct obd_import_conn *imp_conn = NULL, *conn;
	struct obd_export *dlmexp;
	char *target_start;
	int target_len, tried_all = 1;

	spin_lock(&imp->imp_lock);

	if (list_empty(&imp->imp_conn_list)) {
		CERROR("%s: no connections available\n",
		       imp->imp_obd->obd_name);
		spin_unlock(&imp->imp_lock);
		return -EINVAL;
	}

	list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
		CDEBUG(D_HA, "%s: connect to NID %s last attempt %llu\n",
		       imp->imp_obd->obd_name,
		       libcfs_nid2str(conn->oic_conn->c_peer.nid),
		       conn->oic_last_attempt);

		/* If we have not tried this connection since
		 * the last successful attempt, go with this one
		 */
		if ((conn->oic_last_attempt == 0) ||
		    cfs_time_beforeq_64(conn->oic_last_attempt,
					imp->imp_last_success_conn)) {
			imp_conn = conn;
			tried_all = 0;
			break;
		}

		/* If all of the connections have already been tried
		 * since the last successful connection; just choose the
		 * least recently used
		 */
		if (!imp_conn)
			imp_conn = conn;
		else if (cfs_time_before_64(conn->oic_last_attempt,
					    imp_conn->oic_last_attempt))
			imp_conn = conn;
	}

	/* if not found, simply choose the current one */
	if (!imp_conn || imp->imp_force_reconnect) {
		LASSERT(imp->imp_conn_current);
		imp_conn = imp->imp_conn_current;
		tried_all = 0;
	}
	LASSERT(imp_conn->oic_conn);

	/* If we've tried everything, and we're back to the beginning of the
	 * list, increase our timeout and try again. It will be reset when
	 * we do finally connect. (FIXME: really we should wait for all network
	 * state associated with the last connection attempt to drain before
	 * trying to reconnect on it.)
	 */
	if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
		struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;

		/* grow the network-latency estimate, capped at the maximum */
		if (at_get(at) < CONNECTION_SWITCH_MAX) {
			at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
			if (at_get(at) > CONNECTION_SWITCH_MAX)
				at_reset(at, CONNECTION_SWITCH_MAX);
		}
		LASSERT(imp_conn->oic_last_attempt);
		CDEBUG(D_HA, "%s: tried all connections, increasing latency to %ds\n",
		       imp->imp_obd->obd_name, at_get(at));
	}

	imp_conn->oic_last_attempt = cfs_time_current_64();

	/* switch connection, don't mind if it's same as the current one */
	ptlrpc_connection_put(imp->imp_connection);
	imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);

	/* keep the DLM export's connection in sync with the import's */
	dlmexp = class_conn2export(&imp->imp_dlm_handle);
	ptlrpc_connection_put(dlmexp->exp_connection);
	dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
	class_export_put(dlmexp);

	if (imp->imp_conn_current != imp_conn) {
		if (imp->imp_conn_current) {
			deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
				  &target_start, &target_len);

			CDEBUG(D_HA, "%s: Connection changing to %.*s (at %s)\n",
			       imp->imp_obd->obd_name,
			       target_len, target_start,
			       libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
		}

		imp->imp_conn_current = imp_conn;
	}

	CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
	       imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
	       libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));

	spin_unlock(&imp->imp_lock);

	return 0;
}
559 | ||
560 | /* | |
561 | * must be called under imp_lock | |
562 | */ | |
563 | static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) | |
564 | { | |
565 | struct ptlrpc_request *req; | |
566 | struct list_head *tmp; | |
567 | ||
63d42578 | 568 | /* The requests in committed_list always have smaller transnos than |
dadfcdab OD |
569 | * the requests in replay_list |
570 | */ | |
63d42578 HZ |
571 | if (!list_empty(&imp->imp_committed_list)) { |
572 | tmp = imp->imp_committed_list.next; | |
573 | req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); | |
574 | *transno = req->rq_transno; | |
575 | if (req->rq_transno == 0) { | |
576 | DEBUG_REQ(D_ERROR, req, | |
577 | "zero transno in committed_list"); | |
578 | LBUG(); | |
579 | } | |
580 | return 1; | |
d7e09d03 | 581 | } |
63d42578 HZ |
582 | if (!list_empty(&imp->imp_replay_list)) { |
583 | tmp = imp->imp_replay_list.next; | |
584 | req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); | |
585 | *transno = req->rq_transno; | |
586 | if (req->rq_transno == 0) { | |
587 | DEBUG_REQ(D_ERROR, req, "zero transno in replay_list"); | |
588 | LBUG(); | |
589 | } | |
590 | return 1; | |
591 | } | |
592 | return 0; | |
d7e09d03 PT |
593 | } |
594 | ||
/**
 * Attempt to (re)connect import \a imp. This includes all preparations,
 * initializing CONNECT RPC request and passing it to ptlrpcd for
 * actual sending.
 * Returns 0 on success or error code.
 */
int ptlrpc_connect_import(struct obd_import *imp)
{
	struct obd_device *obd = imp->imp_obd;
	int initial_connect = 0;
	int set_transno = 0;
	__u64 committed_before_reconnect = 0;
	struct ptlrpc_request *request;
	/* CONNECT request buffers: target uuid, client uuid, lock handle,
	 * and the negotiated connect data (slot 0 is the msg header).
	 */
	char *bufs[] = { NULL,
			 obd2cli_tgt(imp->imp_obd),
			 obd->obd_uuid.uuid,
			 (char *)&imp->imp_dlm_handle,
			 (char *)&imp->imp_connect_data };
	struct ptlrpc_connect_async_args *aa;
	int rc;

	/* Only one connect attempt at a time; refuse in terminal or
	 * already-connected/connecting states.
	 */
	spin_lock(&imp->imp_lock);
	if (imp->imp_state == LUSTRE_IMP_CLOSED) {
		spin_unlock(&imp->imp_lock);
		CERROR("can't connect to a closed import\n");
		return -EINVAL;
	} else if (imp->imp_state == LUSTRE_IMP_FULL) {
		spin_unlock(&imp->imp_lock);
		CERROR("already connected\n");
		return 0;
	} else if (imp->imp_state == LUSTRE_IMP_CONNECTING) {
		spin_unlock(&imp->imp_lock);
		CERROR("already connecting\n");
		return -EALREADY;
	}

	IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);

	imp->imp_conn_cnt++;
	imp->imp_resend_replay = 0;

	/* a used remote handle means this is a reconnect */
	if (!lustre_handle_is_used(&imp->imp_remote_handle))
		initial_connect = 1;
	else
		committed_before_reconnect = imp->imp_peer_committed_transno;

	set_transno = ptlrpc_first_transno(imp,
					   &imp->imp_connect_data.ocd_transno);
	spin_unlock(&imp->imp_lock);

	rc = import_select_connection(imp);
	if (rc)
		goto out;

	rc = sptlrpc_import_sec_adapt(imp, NULL, NULL);
	if (rc)
		goto out;

	/* Reset connect flags to the originally requested flags, in case
	 * the server is updated on-the-fly we will get the new features.
	 */
	imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig;
	/* Reset ocd_version each time so the server knows the exact versions */
	imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE;
	imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
	imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;

	rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd,
			   &obd->obd_uuid, &imp->imp_connect_data, NULL);
	if (rc)
		goto out;

	request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT);
	if (!request) {
		rc = -ENOMEM;
		goto out;
	}

	rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION,
				      imp->imp_connect_op, bufs, NULL);
	if (rc) {
		ptlrpc_request_free(request);
		goto out;
	}

	/* Report the rpc service time to the server so that it knows how long
	 * to wait for clients to join recovery
	 */
	lustre_msg_set_service_time(request->rq_reqmsg,
				    at_timeout2est(request->rq_timeout));

	/* The amount of time we give the server to process the connect req.
	 * import_select_connection will increase the net latency on
	 * repeated reconnect attempts to cover slow networks.
	 * We override/ignore the server rpc completion estimate here,
	 * which may be large if this is a reconnect attempt
	 */
	request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
	lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);

	lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER);

	/* connect requests are never resent/delayed; failure triggers a
	 * fresh connect attempt instead
	 */
	request->rq_no_resend = 1;
	request->rq_no_delay = 1;
	request->rq_send_state = LUSTRE_IMP_CONNECTING;
	/* Allow a slightly larger reply for future growth compatibility */
	req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER,
			     sizeof(struct obd_connect_data) +
			     16 * sizeof(__u64));
	ptlrpc_request_set_replen(request);
	request->rq_interpret_reply = ptlrpc_connect_interpret;

	CLASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
	aa = ptlrpc_req_async_args(request);
	memset(aa, 0, sizeof(*aa));

	aa->pcaa_peer_committed = committed_before_reconnect;
	aa->pcaa_initial_connect = initial_connect;

	if (aa->pcaa_initial_connect) {
		spin_lock(&imp->imp_lock);
		imp->imp_replayable = 1;
		spin_unlock(&imp->imp_lock);
		lustre_msg_add_op_flags(request->rq_reqmsg,
					MSG_CONNECT_INITIAL);
	}

	if (set_transno)
		lustre_msg_add_op_flags(request->rq_reqmsg,
					MSG_CONNECT_TRANSNO);

	DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)",
		  request->rq_timeout);
	ptlrpcd_add_req(request);
	rc = 0;
out:
	if (rc != 0)
		IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);

	return rc;
}
EXPORT_SYMBOL(ptlrpc_connect_import);
737 | ||
738 | static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) | |
739 | { | |
740 | int force_verify; | |
741 | ||
742 | spin_lock(&imp->imp_lock); | |
743 | force_verify = imp->imp_force_verify != 0; | |
744 | spin_unlock(&imp->imp_lock); | |
745 | ||
746 | if (force_verify) | |
747 | ptlrpc_pinger_wake_up(); | |
748 | } | |
749 | ||
/* A reconnect refused with -EBUSY or -EAGAIN should retry the same
 * target rather than selecting a new connection.
 */
static int ptlrpc_busy_reconnect(int rc)
{
	switch (rc) {
	case -EBUSY:
	case -EAGAIN:
		return 1;
	default:
		return 0;
	}
}
754 | ||
755 | /** | |
756 | * interpret_reply callback for connect RPCs. | |
757 | * Looks into returned status of connect operation and decides | |
758 | * what to do with the import - i.e enter recovery, promote it to | |
759 | * full state for normal operations of disconnect it due to an error. | |
760 | */ | |
761 | static int ptlrpc_connect_interpret(const struct lu_env *env, | |
762 | struct ptlrpc_request *request, | |
763 | void *data, int rc) | |
764 | { | |
765 | struct ptlrpc_connect_async_args *aa = data; | |
766 | struct obd_import *imp = request->rq_import; | |
767 | struct client_obd *cli = &imp->imp_obd->u.cli; | |
768 | struct lustre_handle old_hdl; | |
769 | __u64 old_connect_flags; | |
770 | int msg_flags; | |
771 | struct obd_connect_data *ocd; | |
772 | struct obd_export *exp; | |
773 | int ret; | |
d7e09d03 PT |
774 | |
775 | spin_lock(&imp->imp_lock); | |
776 | if (imp->imp_state == LUSTRE_IMP_CLOSED) { | |
777 | imp->imp_connect_tried = 1; | |
778 | spin_unlock(&imp->imp_lock); | |
0a3bdb00 | 779 | return 0; |
d7e09d03 PT |
780 | } |
781 | ||
782 | if (rc) { | |
783 | /* if this reconnect to busy export - not need select new target | |
dadfcdab OD |
784 | * for connecting |
785 | */ | |
d7e09d03 PT |
786 | imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc); |
787 | spin_unlock(&imp->imp_lock); | |
788 | ptlrpc_maybe_ping_import_soon(imp); | |
a9b3e8f3 | 789 | goto out; |
d7e09d03 PT |
790 | } |
791 | spin_unlock(&imp->imp_lock); | |
792 | ||
793 | LASSERT(imp->imp_conn_current); | |
794 | ||
795 | msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); | |
796 | ||
797 | ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA, | |
798 | RCL_SERVER); | |
799 | /* server replied obd_connect_data is always bigger */ | |
800 | ocd = req_capsule_server_sized_get(&request->rq_pill, | |
801 | &RMF_CONNECT_DATA, ret); | |
802 | ||
8b382089 | 803 | if (!ocd) { |
d7e09d03 PT |
804 | CERROR("%s: no connect data from server\n", |
805 | imp->imp_obd->obd_name); | |
806 | rc = -EPROTO; | |
a9b3e8f3 | 807 | goto out; |
d7e09d03 PT |
808 | } |
809 | ||
810 | spin_lock(&imp->imp_lock); | |
811 | ||
812 | /* All imports are pingable */ | |
813 | imp->imp_pingable = 1; | |
814 | imp->imp_force_reconnect = 0; | |
815 | imp->imp_force_verify = 0; | |
816 | ||
817 | imp->imp_connect_data = *ocd; | |
818 | ||
819 | CDEBUG(D_HA, "%s: connect to target with instance %u\n", | |
820 | imp->imp_obd->obd_name, ocd->ocd_instance); | |
821 | exp = class_conn2export(&imp->imp_dlm_handle); | |
822 | ||
823 | spin_unlock(&imp->imp_lock); | |
824 | ||
825 | /* check that server granted subset of flags we asked for. */ | |
826 | if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) != | |
827 | ocd->ocd_connect_flags) { | |
55f5a824 | 828 | CERROR("%s: Server didn't granted asked subset of flags: asked=%#llx grranted=%#llx\n", |
1d8cb70c | 829 | imp->imp_obd->obd_name, imp->imp_connect_flags_orig, |
d7e09d03 | 830 | ocd->ocd_connect_flags); |
a9b3e8f3 JL |
831 | rc = -EPROTO; |
832 | goto out; | |
d7e09d03 PT |
833 | } |
834 | ||
835 | if (!exp) { | |
836 | /* This could happen if export is cleaned during the | |
dadfcdab OD |
837 | * connect attempt |
838 | */ | |
d7e09d03 PT |
839 | CERROR("%s: missing export after connect\n", |
840 | imp->imp_obd->obd_name); | |
a9b3e8f3 JL |
841 | rc = -ENODEV; |
842 | goto out; | |
d7e09d03 PT |
843 | } |
844 | old_connect_flags = exp_connect_flags(exp); | |
845 | exp->exp_connect_data = *ocd; | |
846 | imp->imp_obd->obd_self_export->exp_connect_data = *ocd; | |
847 | class_export_put(exp); | |
848 | ||
849 | obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD); | |
850 | ||
851 | if (aa->pcaa_initial_connect) { | |
852 | spin_lock(&imp->imp_lock); | |
853 | if (msg_flags & MSG_CONNECT_REPLAYABLE) { | |
854 | imp->imp_replayable = 1; | |
855 | spin_unlock(&imp->imp_lock); | |
856 | CDEBUG(D_HA, "connected to replayable target: %s\n", | |
857 | obd2cli_tgt(imp->imp_obd)); | |
858 | } else { | |
859 | imp->imp_replayable = 0; | |
860 | spin_unlock(&imp->imp_lock); | |
861 | } | |
862 | ||
863 | /* if applies, adjust the imp->imp_msg_magic here | |
dadfcdab OD |
864 | * according to reply flags |
865 | */ | |
d7e09d03 PT |
866 | |
867 | imp->imp_remote_handle = | |
868 | *lustre_msg_get_handle(request->rq_repmsg); | |
869 | ||
870 | /* Initial connects are allowed for clients with non-random | |
871 | * uuids when servers are in recovery. Simply signal the | |
dadfcdab OD |
872 | * servers replay is complete and wait in REPLAY_WAIT. |
873 | */ | |
d7e09d03 PT |
874 | if (msg_flags & MSG_CONNECT_RECOVERING) { |
875 | CDEBUG(D_HA, "connect to %s during recovery\n", | |
876 | obd2cli_tgt(imp->imp_obd)); | |
877 | IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); | |
878 | } else { | |
879 | IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); | |
880 | ptlrpc_activate_import(imp); | |
881 | } | |
882 | ||
a9b3e8f3 JL |
883 | rc = 0; |
884 | goto finish; | |
d7e09d03 PT |
885 | } |
886 | ||
887 | /* Determine what recovery state to move the import to. */ | |
2b241d31 | 888 | if (msg_flags & MSG_CONNECT_RECONNECT) { |
d7e09d03 PT |
889 | memset(&old_hdl, 0, sizeof(old_hdl)); |
890 | if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg), | |
3949015e | 891 | sizeof(old_hdl))) { |
55f5a824 | 892 | LCONSOLE_WARN("Reconnect to %s (at @%s) failed due bad handle %#llx\n", |
d7e09d03 PT |
893 | obd2cli_tgt(imp->imp_obd), |
894 | imp->imp_connection->c_remote_uuid.uuid, | |
895 | imp->imp_dlm_handle.cookie); | |
a9b3e8f3 JL |
896 | rc = -ENOTCONN; |
897 | goto out; | |
d7e09d03 PT |
898 | } |
899 | ||
900 | if (memcmp(&imp->imp_remote_handle, | |
901 | lustre_msg_get_handle(request->rq_repmsg), | |
902 | sizeof(imp->imp_remote_handle))) { | |
903 | int level = msg_flags & MSG_CONNECT_RECOVERING ? | |
904 | D_HA : D_WARNING; | |
905 | ||
906 | /* Bug 16611/14775: if server handle have changed, | |
907 | * that means some sort of disconnection happened. | |
908 | * If the server is not in recovery, that also means it | |
909 | * already erased all of our state because of previous | |
910 | * eviction. If it is in recovery - we are safe to | |
911 | * participate since we can reestablish all of our state | |
dadfcdab OD |
912 | * with server again |
913 | */ | |
2b241d31 | 914 | if ((msg_flags & MSG_CONNECT_RECOVERING)) { |
b533ff4b | 915 | CDEBUG(level, "%s@%s changed server handle from %#llx to %#llx but is still in recovery\n", |
d7e09d03 PT |
916 | obd2cli_tgt(imp->imp_obd), |
917 | imp->imp_connection->c_remote_uuid.uuid, | |
918 | imp->imp_remote_handle.cookie, | |
919 | lustre_msg_get_handle( | |
920 | request->rq_repmsg)->cookie); | |
921 | } else { | |
2d00bd17 | 922 | LCONSOLE_WARN("Evicted from %s (at %s) after server handle changed from %#llx to %#llx\n", |
d7e09d03 PT |
923 | obd2cli_tgt(imp->imp_obd), |
924 | imp->imp_connection-> \ | |
925 | c_remote_uuid.uuid, | |
926 | imp->imp_remote_handle.cookie, | |
927 | lustre_msg_get_handle( | |
2d00bd17 | 928 | request->rq_repmsg)->cookie); |
d7e09d03 PT |
929 | } |
930 | ||
d7e09d03 PT |
931 | imp->imp_remote_handle = |
932 | *lustre_msg_get_handle(request->rq_repmsg); | |
933 | ||
2b241d31 | 934 | if (!(msg_flags & MSG_CONNECT_RECOVERING)) { |
d7e09d03 | 935 | IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); |
a9b3e8f3 JL |
936 | rc = 0; |
937 | goto finish; | |
d7e09d03 PT |
938 | } |
939 | ||
940 | } else { | |
941 | CDEBUG(D_HA, "reconnected to %s@%s after partition\n", | |
942 | obd2cli_tgt(imp->imp_obd), | |
943 | imp->imp_connection->c_remote_uuid.uuid); | |
944 | } | |
945 | ||
946 | if (imp->imp_invalid) { | |
2d00bd17 JP |
947 | CDEBUG(D_HA, "%s: reconnected but import is invalid; marking evicted\n", |
948 | imp->imp_obd->obd_name); | |
d7e09d03 | 949 | IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); |
2b241d31 | 950 | } else if (msg_flags & MSG_CONNECT_RECOVERING) { |
d7e09d03 PT |
951 | CDEBUG(D_HA, "%s: reconnected to %s during replay\n", |
952 | imp->imp_obd->obd_name, | |
953 | obd2cli_tgt(imp->imp_obd)); | |
954 | ||
955 | spin_lock(&imp->imp_lock); | |
956 | imp->imp_resend_replay = 1; | |
957 | spin_unlock(&imp->imp_lock); | |
958 | ||
502cb58e | 959 | IMPORT_SET_STATE(imp, imp->imp_replay_state); |
d7e09d03 PT |
960 | } else { |
961 | IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); | |
962 | } | |
2b241d31 | 963 | } else if ((msg_flags & MSG_CONNECT_RECOVERING) && !imp->imp_invalid) { |
d7e09d03 PT |
964 | LASSERT(imp->imp_replayable); |
965 | imp->imp_remote_handle = | |
966 | *lustre_msg_get_handle(request->rq_repmsg); | |
967 | imp->imp_last_replay_transno = 0; | |
968 | IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); | |
969 | } else { | |
2d00bd17 JP |
970 | DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags not set: %x)", |
971 | imp->imp_obd->obd_name, msg_flags); | |
d7e09d03 PT |
972 | imp->imp_remote_handle = |
973 | *lustre_msg_get_handle(request->rq_repmsg); | |
974 | IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); | |
975 | } | |
976 | ||
977 | /* Sanity checks for a reconnected import. */ | |
978 | if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) { | |
2d00bd17 | 979 | CERROR("imp_replayable flag does not match server after reconnect. We should LBUG right here.\n"); |
d7e09d03 PT |
980 | } |
981 | ||
982 | if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 && | |
983 | lustre_msg_get_last_committed(request->rq_repmsg) < | |
984 | aa->pcaa_peer_committed) { | |
2d00bd17 | 985 | CERROR("%s went back in time (transno %lld was previously committed, server now claims %lld)! See https://bugzilla.lustre.org/show_bug.cgi?id=9646\n", |
d7e09d03 PT |
986 | obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed, |
987 | lustre_msg_get_last_committed(request->rq_repmsg)); | |
988 | } | |
989 | ||
990 | finish: | |
991 | rc = ptlrpc_import_recovery_state_machine(imp); | |
992 | if (rc != 0) { | |
993 | if (rc == -ENOTCONN) { | |
2d00bd17 | 994 | CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n", |
d7e09d03 PT |
995 | obd2cli_tgt(imp->imp_obd), |
996 | imp->imp_connection->c_remote_uuid.uuid); | |
997 | ptlrpc_connect_import(imp); | |
998 | imp->imp_connect_tried = 1; | |
0a3bdb00 | 999 | return 0; |
d7e09d03 PT |
1000 | } |
1001 | } else { | |
35e45816 AD |
1002 | static bool warned; |
1003 | ||
d7e09d03 PT |
1004 | spin_lock(&imp->imp_lock); |
1005 | list_del(&imp->imp_conn_current->oic_item); | |
30c0aa39 | 1006 | list_add(&imp->imp_conn_current->oic_item, &imp->imp_conn_list); |
d7e09d03 PT |
1007 | imp->imp_last_success_conn = |
1008 | imp->imp_conn_current->oic_last_attempt; | |
1009 | ||
1010 | spin_unlock(&imp->imp_lock); | |
1011 | ||
f261f48a FY |
1012 | if ((imp->imp_connect_flags_orig & OBD_CONNECT_IBITS) && |
1013 | !(ocd->ocd_connect_flags & OBD_CONNECT_IBITS)) { | |
2d00bd17 | 1014 | LCONSOLE_WARN("%s: MDS %s does not support ibits lock, either very old or invalid: requested %llx, replied %llx\n", |
f261f48a FY |
1015 | imp->imp_obd->obd_name, |
1016 | imp->imp_connection->c_remote_uuid.uuid, | |
1017 | imp->imp_connect_flags_orig, | |
1018 | ocd->ocd_connect_flags); | |
a9b3e8f3 JL |
1019 | rc = -EPROTO; |
1020 | goto out; | |
f261f48a | 1021 | } |
d7e09d03 | 1022 | |
35e45816 | 1023 | if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && |
d7e09d03 PT |
1024 | (ocd->ocd_version > LUSTRE_VERSION_CODE + |
1025 | LUSTRE_VERSION_OFFSET_WARN || | |
1026 | ocd->ocd_version < LUSTRE_VERSION_CODE - | |
1027 | LUSTRE_VERSION_OFFSET_WARN)) { | |
1028 | /* Sigh, some compilers do not like #ifdef in the middle | |
dadfcdab OD |
1029 | * of macro arguments |
1030 | */ | |
35e45816 AD |
1031 | const char *older = "older than client. Consider upgrading server"; |
1032 | const char *newer = "newer than client. Consider recompiling application"; | |
d7e09d03 | 1033 | |
2d00bd17 | 1034 | LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) is much %s (%s)\n", |
d7e09d03 PT |
1035 | obd2cli_tgt(imp->imp_obd), |
1036 | OBD_OCD_VERSION_MAJOR(ocd->ocd_version), | |
1037 | OBD_OCD_VERSION_MINOR(ocd->ocd_version), | |
1038 | OBD_OCD_VERSION_PATCH(ocd->ocd_version), | |
1039 | OBD_OCD_VERSION_FIX(ocd->ocd_version), | |
1040 | ocd->ocd_version > LUSTRE_VERSION_CODE ? | |
1041 | newer : older, LUSTRE_VERSION_STRING); | |
35e45816 | 1042 | warned = true; |
d7e09d03 PT |
1043 | } |
1044 | ||
1045 | #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) | |
1046 | /* Check if server has LU-1252 fix applied to not always swab | |
1047 | * the IR MNE entries. Do this only once per connection. This | |
1048 | * fixup is version-limited, because we don't want to carry the | |
1049 | * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we | |
1050 | * need interop with unpatched 2.2 servers. For newer servers, | |
dadfcdab OD |
1051 | * the client will do MNE swabbing only as needed. LU-1644 |
1052 | */ | |
d7e09d03 PT |
1053 | if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && |
1054 | !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) && | |
1055 | OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 && | |
1056 | OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 && | |
1057 | OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 && | |
1058 | strcmp(imp->imp_obd->obd_type->typ_name, | |
1059 | LUSTRE_MGC_NAME) == 0)) | |
1060 | imp->imp_need_mne_swab = 1; | |
1061 | else /* clear if server was upgraded since last connect */ | |
1062 | imp->imp_need_mne_swab = 0; | |
1063 | #else | |
1064 | #warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" | |
1065 | #endif | |
1066 | ||
1067 | if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { | |
1068 | /* We sent to the server ocd_cksum_types with bits set | |
1069 | * for algorithms we understand. The server masked off | |
dadfcdab OD |
1070 | * the checksum types it doesn't support |
1071 | */ | |
d7e09d03 PT |
1072 | if ((ocd->ocd_cksum_types & |
1073 | cksum_types_supported_client()) == 0) { | |
2d00bd17 | 1074 | LCONSOLE_WARN("The negotiation of the checksum algorithm to use with server %s failed (%x/%x), disabling checksums\n", |
d7e09d03 PT |
1075 | obd2cli_tgt(imp->imp_obd), |
1076 | ocd->ocd_cksum_types, | |
1077 | cksum_types_supported_client()); | |
1078 | cli->cl_checksum = 0; | |
1079 | cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; | |
1080 | } else { | |
1081 | cli->cl_supp_cksum_types = ocd->ocd_cksum_types; | |
1082 | } | |
1083 | } else { | |
1084 | /* The server does not support OBD_CONNECT_CKSUM. | |
dadfcdab OD |
1085 | * Enforce ADLER for backward compatibility |
1086 | */ | |
d7e09d03 PT |
1087 | cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; |
1088 | } | |
b533ff4b | 1089 | cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types); |
d7e09d03 PT |
1090 | |
1091 | if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) | |
1092 | cli->cl_max_pages_per_rpc = | |
09cbfeaf | 1093 | min(ocd->ocd_brw_size >> PAGE_SHIFT, |
d7e09d03 PT |
1094 | cli->cl_max_pages_per_rpc); |
1095 | else if (imp->imp_connect_op == MDS_CONNECT || | |
1096 | imp->imp_connect_op == MGS_CONNECT) | |
1097 | cli->cl_max_pages_per_rpc = 1; | |
1098 | ||
1099 | /* Reset ns_connect_flags only for initial connect. It might be | |
1100 | * changed in while using FS and if we reset it in reconnect | |
1101 | * this leads to losing user settings done before such as | |
dadfcdab OD |
1102 | * disable lru_resize, etc. |
1103 | */ | |
d7e09d03 PT |
1104 | if (old_connect_flags != exp_connect_flags(exp) || |
1105 | aa->pcaa_initial_connect) { | |
55f5a824 GKH |
1106 | CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server flags: %#llx\n", |
1107 | imp->imp_obd->obd_name, ocd->ocd_connect_flags); | |
d7e09d03 PT |
1108 | imp->imp_obd->obd_namespace->ns_connect_flags = |
1109 | ocd->ocd_connect_flags; | |
1110 | imp->imp_obd->obd_namespace->ns_orig_connect_flags = | |
1111 | ocd->ocd_connect_flags; | |
1112 | } | |
1113 | ||
1114 | if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) && | |
1115 | (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) | |
1116 | /* We need a per-message support flag, because | |
dadfcdab OD |
1117 | * a. we don't know if the incoming connect reply |
1118 | * supports AT or not (in reply_in_callback) | |
1119 | * until we unpack it. | |
1120 | * b. failovered server means export and flags are gone | |
1121 | * (in ptlrpc_send_reply). | |
1122 | * Can only be set when we know AT is supported at | |
1123 | * both ends | |
1124 | */ | |
d7e09d03 PT |
1125 | imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; |
1126 | else | |
1127 | imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; | |
1128 | ||
1129 | if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) && | |
1130 | (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) | |
1131 | imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18; | |
1132 | else | |
1133 | imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; | |
1134 | ||
1135 | LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) && | |
1136 | (cli->cl_max_pages_per_rpc > 0)); | |
3147b268 | 1137 | client_adjust_max_dirty(cli); |
d7e09d03 PT |
1138 | } |
1139 | ||
1140 | out: | |
1141 | imp->imp_connect_tried = 1; | |
1142 | ||
1143 | if (rc != 0) { | |
1144 | IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); | |
1145 | if (rc == -EACCES) { | |
1146 | /* | |
1147 | * Give up trying to reconnect | |
1148 | * EACCES means client has no permission for connection | |
1149 | */ | |
1150 | imp->imp_obd->obd_no_recov = 1; | |
1151 | ptlrpc_deactivate_import(imp); | |
1152 | } | |
1153 | ||
1154 | if (rc == -EPROTO) { | |
1155 | struct obd_connect_data *ocd; | |
1156 | ||
1157 | /* reply message might not be ready */ | |
8b382089 | 1158 | if (!request->rq_repmsg) |
0a3bdb00 | 1159 | return -EPROTO; |
d7e09d03 PT |
1160 | |
1161 | ocd = req_capsule_server_get(&request->rq_pill, | |
1162 | &RMF_CONNECT_DATA); | |
1163 | if (ocd && | |
1164 | (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && | |
1165 | (ocd->ocd_version != LUSTRE_VERSION_CODE)) { | |
532118c0 KM |
1166 | /* |
1167 | * Actually servers are only supposed to refuse | |
1168 | * connection from liblustre clients, so we | |
1169 | * should never see this from VFS context | |
1170 | */ | |
2d00bd17 JP |
1171 | LCONSOLE_ERROR_MSG(0x16a, "Server %s version (%d.%d.%d.%d) refused connection from this client with an incompatible version (%s). Client must be recompiled\n", |
1172 | obd2cli_tgt(imp->imp_obd), | |
1173 | OBD_OCD_VERSION_MAJOR(ocd->ocd_version), | |
1174 | OBD_OCD_VERSION_MINOR(ocd->ocd_version), | |
1175 | OBD_OCD_VERSION_PATCH(ocd->ocd_version), | |
1176 | OBD_OCD_VERSION_FIX(ocd->ocd_version), | |
1177 | LUSTRE_VERSION_STRING); | |
d7e09d03 PT |
1178 | ptlrpc_deactivate_import(imp); |
1179 | IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED); | |
1180 | } | |
0a3bdb00 | 1181 | return -EPROTO; |
d7e09d03 PT |
1182 | } |
1183 | ||
1184 | ptlrpc_maybe_ping_import_soon(imp); | |
1185 | ||
1186 | CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", | |
1187 | obd2cli_tgt(imp->imp_obd), | |
1188 | (char *)imp->imp_connection->c_remote_uuid.uuid, rc); | |
1189 | } | |
1190 | ||
1191 | wake_up_all(&imp->imp_recovery_waitq); | |
0a3bdb00 | 1192 | return rc; |
d7e09d03 PT |
1193 | } |
1194 | ||
1195 | /** | |
1196 | * interpret callback for "completed replay" RPCs. | |
1197 | * \see signal_completed_replay | |
1198 | */ | |
1199 | static int completed_replay_interpret(const struct lu_env *env, | |
1200 | struct ptlrpc_request *req, | |
aff9d8e8 | 1201 | void *data, int rc) |
d7e09d03 | 1202 | { |
d7e09d03 PT |
1203 | atomic_dec(&req->rq_import->imp_replay_inflight); |
1204 | if (req->rq_status == 0 && | |
1205 | !req->rq_import->imp_vbr_failed) { | |
1206 | ptlrpc_import_recovery_state_machine(req->rq_import); | |
1207 | } else { | |
1208 | if (req->rq_import->imp_vbr_failed) { | |
1209 | CDEBUG(D_WARNING, | |
1210 | "%s: version recovery fails, reconnecting\n", | |
1211 | req->rq_import->imp_obd->obd_name); | |
1212 | } else { | |
2d00bd17 | 1213 | CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, reconnecting\n", |
d7e09d03 PT |
1214 | req->rq_import->imp_obd->obd_name, |
1215 | req->rq_status); | |
1216 | } | |
1217 | ptlrpc_connect_import(req->rq_import); | |
1218 | } | |
1219 | ||
0a3bdb00 | 1220 | return 0; |
d7e09d03 PT |
1221 | } |
1222 | ||
1223 | /** | |
1224 | * Let server know that we have no requests to replay anymore. | |
1225 | * Achieved by just sending a PING request | |
1226 | */ | |
1227 | static int signal_completed_replay(struct obd_import *imp) | |
1228 | { | |
1229 | struct ptlrpc_request *req; | |
d7e09d03 PT |
1230 | |
1231 | if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY))) | |
0a3bdb00 | 1232 | return 0; |
d7e09d03 PT |
1233 | |
1234 | LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); | |
1235 | atomic_inc(&imp->imp_replay_inflight); | |
1236 | ||
1237 | req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION, | |
1238 | OBD_PING); | |
8b382089 | 1239 | if (!req) { |
d7e09d03 | 1240 | atomic_dec(&imp->imp_replay_inflight); |
0a3bdb00 | 1241 | return -ENOMEM; |
d7e09d03 PT |
1242 | } |
1243 | ||
1244 | ptlrpc_request_set_replen(req); | |
1245 | req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT; | |
1246 | lustre_msg_add_flags(req->rq_reqmsg, | |
1247 | MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE); | |
1248 | if (AT_OFF) | |
1249 | req->rq_timeout *= 3; | |
1250 | req->rq_interpret_reply = completed_replay_interpret; | |
1251 | ||
c5c4c6fa | 1252 | ptlrpcd_add_req(req); |
0a3bdb00 | 1253 | return 0; |
d7e09d03 PT |
1254 | } |
1255 | ||
/**
 * In kernel code all import invalidation happens in its own
 * separate thread, so that whatever application happened to encounter
 * a problem could still be killed or otherwise continue
 *
 * \param data  struct obd_import pointer; the spawner took an extra
 *              reference on it (see ptlrpc_import_recovery_state_machine()),
 *              which this thread drops before exiting.
 */
static int ptlrpc_invalidate_import_thread(void *data)
{
	struct obd_import *imp = data;

	/* Detach from the spawner's fs_struct so this kthread does not
	 * pin its root/cwd.
	 */
	unshare_fs_struct();

	CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
	       imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
	       imp->imp_connection->c_remote_uuid.uuid);

	ptlrpc_invalidate_import(imp);

	if (obd_dump_on_eviction) {
		CERROR("dump the log upon eviction\n");
		libcfs_debug_dumplog();
	}

	/* Invalidation done; move to RECOVER and re-enter the state
	 * machine to resend/reactivate.
	 */
	IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
	ptlrpc_import_recovery_state_machine(imp);

	/* Drop the reference taken by the spawner on our behalf. */
	class_import_put(imp);
	return 0;
}
1284 | ||
1285 | /** | |
1286 | * This is the state machine for client-side recovery on import. | |
1287 | * | |
b6da17f3 | 1288 | * Typically we have two possibly paths. If we came to server and it is not |
d7e09d03 PT |
1289 | * in recovery, we just enter IMP_EVICTED state, invalidate our import |
1290 | * state and reconnect from scratch. | |
1291 | * If we came to server that is in recovery, we enter IMP_REPLAY import state. | |
1292 | * We go through our list of requests to replay and send them to server one by | |
1293 | * one. | |
1294 | * After sending all request from the list we change import state to | |
1295 | * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from server | |
1296 | * and also all the locks we don't yet have and wait for server to grant us. | |
1297 | * After that we send a special "replay completed" request and change import | |
1298 | * state to IMP_REPLAY_WAIT. | |
1299 | * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER | |
1300 | * state and resend all requests from sending list. | |
1301 | * After that we promote import to FULL state and send all delayed requests | |
1302 | * and import is fully operational after that. | |
1303 | * | |
1304 | */ | |
1305 | int ptlrpc_import_recovery_state_machine(struct obd_import *imp) | |
1306 | { | |
1307 | int rc = 0; | |
1308 | int inflight; | |
1309 | char *target_start; | |
1310 | int target_len; | |
1311 | ||
d7e09d03 PT |
1312 | if (imp->imp_state == LUSTRE_IMP_EVICTED) { |
1313 | deuuidify(obd2cli_tgt(imp->imp_obd), NULL, | |
1314 | &target_start, &target_len); | |
1315 | /* Don't care about MGC eviction */ | |
1316 | if (strcmp(imp->imp_obd->obd_type->typ_name, | |
1317 | LUSTRE_MGC_NAME) != 0) { | |
2d00bd17 | 1318 | LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted by %.*s; in progress operations using this service will fail.\n", |
d7e09d03 PT |
1319 | imp->imp_obd->obd_name, target_len, |
1320 | target_start); | |
1321 | } | |
1322 | CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", | |
1323 | obd2cli_tgt(imp->imp_obd), | |
1324 | imp->imp_connection->c_remote_uuid.uuid); | |
1325 | /* reset vbr_failed flag upon eviction */ | |
1326 | spin_lock(&imp->imp_lock); | |
1327 | imp->imp_vbr_failed = 0; | |
1328 | spin_unlock(&imp->imp_lock); | |
1329 | ||
1330 | { | |
68b636b6 | 1331 | struct task_struct *task; |
d7e09d03 | 1332 | /* bug 17802: XXX client_disconnect_export vs connect request |
9c379663 | 1333 | * race. if client is evicted at this time, we start |
d7e09d03 | 1334 | * invalidate thread without reference to import and import can |
dadfcdab OD |
1335 | * be freed at same time. |
1336 | */ | |
d7e09d03 PT |
1337 | class_import_get(imp); |
1338 | task = kthread_run(ptlrpc_invalidate_import_thread, imp, | |
30c0aa39 | 1339 | "ll_imp_inval"); |
d7e09d03 PT |
1340 | if (IS_ERR(task)) { |
1341 | class_import_put(imp); | |
1342 | CERROR("error starting invalidate thread: %d\n", rc); | |
1343 | rc = PTR_ERR(task); | |
1344 | } else { | |
1345 | rc = 0; | |
1346 | } | |
0a3bdb00 | 1347 | return rc; |
d7e09d03 PT |
1348 | } |
1349 | } | |
1350 | ||
1351 | if (imp->imp_state == LUSTRE_IMP_REPLAY) { | |
1352 | CDEBUG(D_HA, "replay requested by %s\n", | |
1353 | obd2cli_tgt(imp->imp_obd)); | |
1354 | rc = ptlrpc_replay_next(imp, &inflight); | |
1355 | if (inflight == 0 && | |
1356 | atomic_read(&imp->imp_replay_inflight) == 0) { | |
1357 | IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); | |
1358 | rc = ldlm_replay_locks(imp); | |
1359 | if (rc) | |
a9b3e8f3 | 1360 | goto out; |
d7e09d03 PT |
1361 | } |
1362 | rc = 0; | |
1363 | } | |
1364 | ||
1365 | if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) { | |
1366 | if (atomic_read(&imp->imp_replay_inflight) == 0) { | |
1367 | IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT); | |
1368 | rc = signal_completed_replay(imp); | |
1369 | if (rc) | |
a9b3e8f3 | 1370 | goto out; |
d7e09d03 | 1371 | } |
d7e09d03 PT |
1372 | } |
1373 | ||
1374 | if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) { | |
1375 | if (atomic_read(&imp->imp_replay_inflight) == 0) { | |
1376 | IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); | |
1377 | } | |
1378 | } | |
1379 | ||
1380 | if (imp->imp_state == LUSTRE_IMP_RECOVER) { | |
1381 | CDEBUG(D_HA, "reconnected to %s@%s\n", | |
1382 | obd2cli_tgt(imp->imp_obd), | |
1383 | imp->imp_connection->c_remote_uuid.uuid); | |
1384 | ||
1385 | rc = ptlrpc_resend(imp); | |
1386 | if (rc) | |
a9b3e8f3 | 1387 | goto out; |
d7e09d03 PT |
1388 | IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); |
1389 | ptlrpc_activate_import(imp); | |
1390 | ||
1391 | deuuidify(obd2cli_tgt(imp->imp_obd), NULL, | |
1392 | &target_start, &target_len); | |
1393 | LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n", | |
1394 | imp->imp_obd->obd_name, | |
1395 | target_len, target_start, | |
1396 | libcfs_nid2str(imp->imp_connection->c_peer.nid)); | |
1397 | } | |
1398 | ||
1399 | if (imp->imp_state == LUSTRE_IMP_FULL) { | |
1400 | wake_up_all(&imp->imp_recovery_waitq); | |
1401 | ptlrpc_wake_delayed(imp); | |
1402 | } | |
1403 | ||
1404 | out: | |
0a3bdb00 | 1405 | return rc; |
d7e09d03 PT |
1406 | } |
1407 | ||
/**
 * Disconnect the import from its target, optionally keeping it around
 * for later reconnection.
 *
 * \param imp      import to disconnect
 * \param noclose  if non-zero, leave the import in DISCON state (it can be
 *                 reconnected later); otherwise move it to CLOSED
 * \return 0 on success (timeouts/connection-loss errors are masked to 0),
 *         negative errno otherwise
 *
 * Locking note: the "out:" label below is reached with imp_lock HELD --
 * either from the failed state check (lock still held from the preceding
 * spin_lock) or by falling through "set_state:" which takes it explicitly.
 */
int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
{
	struct ptlrpc_request *req;
	int rq_opc, rc = 0;

	/* Forced cleanup: skip the DISCONNECT RPC entirely. */
	if (imp->imp_obd->obd_force)
		goto set_state;

	/* Map the connect opcode used at setup time to the matching
	 * disconnect opcode.
	 */
	switch (imp->imp_connect_op) {
	case OST_CONNECT:
		rq_opc = OST_DISCONNECT;
		break;
	case MDS_CONNECT:
		rq_opc = MDS_DISCONNECT;
		break;
	case MGS_CONNECT:
		rq_opc = MGS_DISCONNECT;
		break;
	default:
		rc = -EINVAL;
		CERROR("%s: don't know how to disconnect from %s (connect_op %d): rc = %d\n",
		       imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
		       imp->imp_connect_op, rc);
		return rc;
	}

	if (ptlrpc_import_in_recovery(imp)) {
		struct l_wait_info lwi;
		long timeout;

		/* Wait (interruptibly, with a cap) for in-progress recovery
		 * to settle before sending the disconnect.
		 */
		if (AT_OFF) {
			if (imp->imp_server_timeout)
				timeout = cfs_time_seconds(obd_timeout / 2);
			else
				timeout = cfs_time_seconds(obd_timeout);
		} else {
			int idx = import_at_get_index(imp,
				imp->imp_client->cli_request_portal);
			timeout = cfs_time_seconds(
				at_get(&imp->imp_at.iat_service_estimate[idx]));
		}

		lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
				       back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
		rc = l_wait_event(imp->imp_recovery_waitq,
				  !ptlrpc_import_in_recovery(imp), &lwi);
	}

	spin_lock(&imp->imp_lock);
	if (imp->imp_state != LUSTRE_IMP_FULL)
		goto out;	/* NOTE: jumps to "out" with imp_lock held */
	spin_unlock(&imp->imp_lock);

	/* RQF_MDS_DISCONNECT is used for all targets here; only rq_opc
	 * differs per connect type.
	 */
	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT,
					LUSTRE_OBD_VERSION, rq_opc);
	if (req) {
		/* We are disconnecting, do not retry a failed DISCONNECT rpc if
		 * it fails. We can get through the above with a down server
		 * if the client doesn't know the server is gone yet.
		 */
		req->rq_no_resend = 1;

		/* We want client umounts to happen quickly, no matter the
		 * server state...
		 */
		req->rq_timeout = min_t(int, req->rq_timeout,
					INITIAL_CONNECT_TIMEOUT);

		IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
		req->rq_send_state = LUSTRE_IMP_CONNECTING;
		ptlrpc_request_set_replen(req);
		rc = ptlrpc_queue_wait(req);
		ptlrpc_req_finished(req);
	}

set_state:
	spin_lock(&imp->imp_lock);
out:
	/* imp_lock is held here on every path */
	if (noclose)
		IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
	else
		IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
	memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
	spin_unlock(&imp->imp_lock);

	/* Expected failures when the peer is already gone are not errors. */
	if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN)
		rc = 0;

	return rc;
}
EXPORT_SYMBOL(ptlrpc_disconnect_import);
1499 | ||
d7e09d03 PT |
1500 | /* Adaptive Timeout utils */ |
1501 | extern unsigned int at_min, at_max, at_history; | |
1502 | ||
31c5e95e CH |
/*
 * Update at_current with the specified value (bounded by at_min and at_max),
 * as well as the AT history "bins".
 * - Bin into timeslices using AT_BINS bins.
 * - This gives us a max of the last at_history seconds without the storage,
 *   but still smoothing out a return to normalcy from a slow response.
 * - (E.g. remember the maximum latency in each minute of the last 4 minutes.)
 *
 * Returns the previous at_current value if it changed, 0 otherwise.
 */
int at_measured(struct adaptive_timeout *at, unsigned int val)
{
	unsigned int old = at->at_current;
	time64_t now = ktime_get_real_seconds();
	/* width of one history bin in seconds; at least 1 */
	long binlimit = max_t(long, at_history / AT_BINS, 1);

	LASSERT(at);
	CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
	       val, at, (long)(now - at->at_binstart), at->at_current,
	       at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);

	if (val == 0)
		/* 0's don't count, because we never want our timeout to
		 * drop to 0, and because 0 could mean an error
		 */
		return 0;

	spin_lock(&at->at_lock);

	if (unlikely(at->at_binstart == 0)) {
		/* Special case to remove default from history */
		at->at_current = val;
		at->at_worst_ever = val;
		at->at_worst_time = now;
		at->at_hist[0] = val;
		at->at_binstart = now;
	} else if (now - at->at_binstart < binlimit) {
		/* in bin 0: still within the current timeslice */
		at->at_hist[0] = max(val, at->at_hist[0]);
		at->at_current = max(val, at->at_current);
	} else {
		int i, shift;
		unsigned int maxv = val;
		/* move bins over: age the history by `shift` slots and
		 * recompute the running max over the surviving bins
		 */
		shift = (u32)(now - at->at_binstart) / binlimit;
		LASSERT(shift > 0);
		for (i = AT_BINS - 1; i >= 0; i--) {
			if (i >= shift) {
				at->at_hist[i] = at->at_hist[i - shift];
				maxv = max(maxv, at->at_hist[i]);
			} else {
				at->at_hist[i] = 0;
			}
		}
		at->at_hist[0] = val;
		at->at_current = maxv;
		at->at_binstart += shift * binlimit;
	}

	/* track the worst value ever seen and when it happened */
	if (at->at_current > at->at_worst_ever) {
		at->at_worst_ever = at->at_current;
		at->at_worst_time = now;
	}

	if (at->at_flags & AT_FLG_NOHIST)
		/* Only keep last reported val; keeping the rest of the history
		 * for debugfs only
		 */
		at->at_current = val;

	/* clamp to [at_min, at_max] (at_max == 0 means "no upper bound") */
	if (at_max > 0)
		at->at_current = min(at->at_current, at_max);
	at->at_current = max(at->at_current, at_min);

	if (at->at_current != old)
		CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d (val=%u) hist %u %u %u %u\n",
		       at,
		       old, at->at_current, at->at_current - old, val,
		       at->at_hist[0], at->at_hist[1], at->at_hist[2],
		       at->at_hist[3]);

	/* if we changed, report the old value */
	old = (at->at_current != old) ? old : 0;

	spin_unlock(&at->at_lock);
	return old;
}
1588 | ||
1589 | /* Find the imp_at index for a given portal; assign if space available */ | |
1590 | int import_at_get_index(struct obd_import *imp, int portal) | |
1591 | { | |
1592 | struct imp_at *at = &imp->imp_at; | |
1593 | int i; | |
1594 | ||
1595 | for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { | |
1596 | if (at->iat_portal[i] == portal) | |
1597 | return i; | |
1598 | if (at->iat_portal[i] == 0) | |
1599 | /* unused */ | |
1600 | break; | |
1601 | } | |
1602 | ||
1603 | /* Not found in list, add it under a lock */ | |
1604 | spin_lock(&imp->imp_lock); | |
1605 | ||
1606 | /* Check unused under lock */ | |
1607 | for (; i < IMP_AT_MAX_PORTALS; i++) { | |
1608 | if (at->iat_portal[i] == portal) | |
1609 | goto out; | |
1610 | if (at->iat_portal[i] == 0) | |
1611 | /* unused */ | |
1612 | break; | |
1613 | } | |
1614 | ||
1615 | /* Not enough portals? */ | |
1616 | LASSERT(i < IMP_AT_MAX_PORTALS); | |
1617 | ||
1618 | at->iat_portal[i] = portal; | |
1619 | out: | |
1620 | spin_unlock(&imp->imp_lock); | |
1621 | return i; | |
1622 | } |