1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
19 *
20 * GPL HEADER END
21 */
22/*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
25 *
26 * Copyright (c) 2011, 2015, Intel Corporation.
27 */
28/*
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
31 *
32 * lustre/ptlrpc/import.c
33 *
34 * Author: Mike Shaver <shaver@clusterfs.com>
35 */
36
37#define DEBUG_SUBSYSTEM S_RPC
38
39#include "../include/obd_support.h"
40#include "../include/lustre_ha.h"
41#include "../include/lustre_net.h"
42#include "../include/lustre_import.h"
43#include "../include/lustre_export.h"
44#include "../include/obd.h"
45#include "../include/obd_cksum.h"
46#include "../include/obd_class.h"
47
48#include "ptlrpc_internal.h"
49
50struct ptlrpc_connect_async_args {
51 __u64 pcaa_peer_committed;
52 int pcaa_initial_connect;
53};
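/*
 * pcaa_peer_committed caches the server's last committed transno as seen
 * before this (re)connect attempt, letting ptlrpc_connect_interpret() spot a
 * server that "went back in time"; pcaa_initial_connect distinguishes the
 * very first connect from a reconnect.
 */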
54
55/**
56 * Update the current state of import \a imp to the provided \a state value.
57 * Helper function. Must be called under imp_lock.
58 */
59static void __import_set_state(struct obd_import *imp,
60 enum lustre_imp_state state)
61{
62 switch (state) {
63 case LUSTRE_IMP_CLOSED:
64 case LUSTRE_IMP_NEW:
65 case LUSTRE_IMP_DISCON:
66 case LUSTRE_IMP_CONNECTING:
67 break;
68 case LUSTRE_IMP_REPLAY_WAIT:
69 imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS;
70 break;
71 default:
72 imp->imp_replay_state = LUSTRE_IMP_REPLAY;
73 }
74
75 imp->imp_state = state;
76 imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state;
77 imp->imp_state_hist[imp->imp_state_hist_idx].ish_time =
78 ktime_get_real_seconds();
79 imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) %
80 IMP_STATE_HIST_LEN;
81}
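/*
 * imp_state_hist acts as a small ring buffer: each transition stores the new
 * state plus a wallclock timestamp, and the index wraps at
 * IMP_STATE_HIST_LEN, so only the most recent transitions are kept.
 */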
82
83/* A CLOSED import should remain so. */
84#define IMPORT_SET_STATE_NOLOCK(imp, state) \
85do { \
86 if (imp->imp_state != LUSTRE_IMP_CLOSED) { \
87 CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \
88 imp, obd2cli_tgt(imp->imp_obd), \
89 ptlrpc_import_state_name(imp->imp_state), \
90 ptlrpc_import_state_name(state)); \
91 __import_set_state(imp, state); \
92 } \
93} while (0)
94
95#define IMPORT_SET_STATE(imp, state) \
96do { \
97 spin_lock(&imp->imp_lock); \
98 IMPORT_SET_STATE_NOLOCK(imp, state); \
99 spin_unlock(&imp->imp_lock); \
100} while (0)
101
102static int ptlrpc_connect_interpret(const struct lu_env *env,
103 struct ptlrpc_request *request,
104 void *data, int rc);
105int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
106
107/* Only this function is allowed to change the import state when it is
108 * CLOSED. I would rather refcount the import and free it after
109 * disconnection like we do with exports. To do that, the client_obd
110 * will need to save the peer info somewhere other than in the import,
111 * though.
112 */
113int ptlrpc_init_import(struct obd_import *imp)
114{
115 spin_lock(&imp->imp_lock);
116
117 imp->imp_generation++;
118 imp->imp_state = LUSTRE_IMP_NEW;
119
120 spin_unlock(&imp->imp_lock);
121
122 return 0;
123}
124EXPORT_SYMBOL(ptlrpc_init_import);
125
126#define UUID_STR "_UUID"
127static void deuuidify(char *uuid, const char *prefix, char **uuid_start,
128 int *uuid_len)
129{
130 *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix))
131 ? uuid : uuid + strlen(prefix);
132
133 *uuid_len = strlen(*uuid_start);
134
135 if (*uuid_len < strlen(UUID_STR))
136 return;
137
138 if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR),
139 UUID_STR, strlen(UUID_STR)))
140 *uuid_len -= strlen(UUID_STR);
141}
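/*
 * Example: given a target name ending in "_UUID" (e.g. "<fsname>-OST0000_UUID"),
 * *uuid_start points at the name (past any prefix) and *uuid_len is trimmed so
 * that printing with "%.*s" drops the trailing "_UUID".
 */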
142
143/**
144 * Returns true if import was FULL, false if import was already not
145 * connected.
146 * @imp - import to be disconnected
147 * @conn_cnt - connection count (epoch) of the request that timed out
148 * and caused the disconnection. In some cases, multiple
149 * inflight requests can fail to a single target (e.g. OST
150 * bulk requests) and if one has already caused a reconnection
151 * (increasing the import->conn_cnt) the older failure should
152 * not also cause a reconnection. If zero it forces a reconnect.
153 */
154int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt)
155{
156 int rc = 0;
157
158 spin_lock(&imp->imp_lock);
159
160 if (imp->imp_state == LUSTRE_IMP_FULL &&
161 (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) {
162 char *target_start;
163 int target_len;
164
165 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
166 &target_start, &target_len);
167
168 if (imp->imp_replayable) {
169 LCONSOLE_WARN("%s: Connection to %.*s (at %s) was lost; in progress operations using this service will wait for recovery to complete\n",
170 imp->imp_obd->obd_name, target_len, target_start,
171 libcfs_nid2str(imp->imp_connection->c_peer.nid));
172 } else {
173 LCONSOLE_ERROR_MSG(0x166, "%s: Connection to %.*s (at %s) was lost; in progress operations using this service will fail\n",
174 imp->imp_obd->obd_name,
175 target_len, target_start,
176 libcfs_nid2str(imp->imp_connection->c_peer.nid));
177 }
178 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
179 spin_unlock(&imp->imp_lock);
180
181 if (obd_dump_on_timeout)
182 libcfs_debug_dumplog();
183
184 obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
185 rc = 1;
186 } else {
187 spin_unlock(&imp->imp_lock);
188 CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n",
189 imp->imp_client->cli_name, imp,
190 (imp->imp_state == LUSTRE_IMP_FULL &&
191 imp->imp_conn_cnt > conn_cnt) ?
192 "reconnected" : "not connected", imp->imp_conn_cnt,
193 conn_cnt, ptlrpc_import_state_name(imp->imp_state));
194 }
195
196 return rc;
197}
198
199/*
200 * This acts as a barrier; all existing requests are rejected, and
201 * no new requests will be accepted until the import is valid again.
202 */
203void ptlrpc_deactivate_import(struct obd_import *imp)
204{
205 CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd));
206
207 spin_lock(&imp->imp_lock);
208 imp->imp_invalid = 1;
209 imp->imp_generation++;
210 spin_unlock(&imp->imp_lock);
211
212 ptlrpc_abort_inflight(imp);
213 obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE);
214}
215EXPORT_SYMBOL(ptlrpc_deactivate_import);
216
217static unsigned int
218ptlrpc_inflight_deadline(struct ptlrpc_request *req, time64_t now)
219{
220 long dl;
221
222 if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
223 (req->rq_phase == RQ_PHASE_BULK) ||
224 (req->rq_phase == RQ_PHASE_NEW)))
225 return 0;
226
227 if (req->rq_timedout)
228 return 0;
229
230 if (req->rq_phase == RQ_PHASE_NEW)
231 dl = req->rq_sent;
232 else
233 dl = req->rq_deadline;
234
235 if (dl <= now)
236 return 0;
237
238 return dl - now;
239}
240
241static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp)
242{
243 time64_t now = ktime_get_real_seconds();
244 struct list_head *tmp, *n;
245 struct ptlrpc_request *req;
246 unsigned int timeout = 0;
247
248 spin_lock(&imp->imp_lock);
249 list_for_each_safe(tmp, n, &imp->imp_sending_list) {
250 req = list_entry(tmp, struct ptlrpc_request, rq_list);
251 timeout = max(ptlrpc_inflight_deadline(req, now), timeout);
252 }
253 spin_unlock(&imp->imp_lock);
254 return timeout;
255}
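/*
 * Example: if the slowest request still on imp_sending_list has 30 seconds
 * left before its deadline, this returns 30; ptlrpc_invalidate_import() then
 * pads the result by a third before waiting for the RPCs to error out.
 */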
256
257/**
258 * This function will invalidate the import, if necessary, then block
259 * for all the RPC completions, and finally notify the obd to
260 * invalidate its state (i.e. cancel locks, clear pending requests,
261 * etc).
262 */
263void ptlrpc_invalidate_import(struct obd_import *imp)
264{
265 struct list_head *tmp, *n;
266 struct ptlrpc_request *req;
267 struct l_wait_info lwi;
268 unsigned int timeout;
269 int rc;
270
271 atomic_inc(&imp->imp_inval_count);
272
273 if (!imp->imp_invalid || imp->imp_obd->obd_no_recov)
274 ptlrpc_deactivate_import(imp);
275
276 CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2);
277 LASSERT(imp->imp_invalid);
278
279 /* Wait forever until inflight == 0. We really can't do it another
280 * way because in some cases we need to wait for very long reply
281 * unlink. We can't do anything before that because there is really
282 * no guarantee that some rdma transfer is not in progress right now.
283 */
284 do {
285 /* Calculate max timeout for waiting on rpcs to error
286 * out. Use obd_timeout if the calculated value is smaller
287 * than it.
288 */
289 if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
290 timeout = ptlrpc_inflight_timeout(imp);
291 timeout += timeout / 3;
292
293 if (timeout == 0)
294 timeout = obd_timeout;
295 } else {
296 /* decrease the interval to increase race condition */
297 timeout = 1;
298 }
299
300 CDEBUG(D_RPCTRACE,
301 "Sleeping %d sec for inflight to error out\n",
302 timeout);
303
304 /* Wait for all requests to error out and call completion
305 * callbacks. Cap it at obd_timeout -- these should all
306 * have been locally cancelled by ptlrpc_abort_inflight.
307 */
308 lwi = LWI_TIMEOUT_INTERVAL(
309 cfs_timeout_cap(cfs_time_seconds(timeout)),
310 (timeout > 1) ? cfs_time_seconds(1) :
311 cfs_time_seconds(1) / 2,
312 NULL, NULL);
313 rc = l_wait_event(imp->imp_recovery_waitq,
314 (atomic_read(&imp->imp_inflight) == 0),
315 &lwi);
316 if (rc) {
317 const char *cli_tgt = obd2cli_tgt(imp->imp_obd);
318
319 CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
320 cli_tgt, rc,
321 atomic_read(&imp->imp_inflight));
322
323 spin_lock(&imp->imp_lock);
324 if (atomic_read(&imp->imp_inflight) == 0) {
325 int count = atomic_read(&imp->imp_unregistering);
326
327 /* We know that "unregistering" rpcs only can
328 * survive in sending or delaying lists (they
329 * maybe waiting for long reply unlink in
330 * sluggish nets). Let's check this. If there
331 * is no inflight and unregistering != 0, this
332 * is a bug.
333 */
334 LASSERTF(count == 0, "Some RPCs are still unregistering: %d\n",
335 count);
336
337 /* Let's save one loop as soon as inflight has
338 * dropped to zero. No new inflights possible at
339 * this point.
340 */
341 rc = 0;
342 } else {
343 list_for_each_safe(tmp, n,
344 &imp->imp_sending_list) {
345 req = list_entry(tmp,
346 struct ptlrpc_request,
347 rq_list);
348 DEBUG_REQ(D_ERROR, req,
349 "still on sending list");
350 }
351 list_for_each_safe(tmp, n,
352 &imp->imp_delayed_list) {
353 req = list_entry(tmp,
354 struct ptlrpc_request,
355 rq_list);
356 DEBUG_REQ(D_ERROR, req,
357 "still on delayed list");
358 }
359
360 CERROR("%s: Unregistering RPCs found (%d). Network is sluggish? Waiting for them to error out.\n",
361 cli_tgt,
362 atomic_read(&imp->
363 imp_unregistering));
364 }
365 spin_unlock(&imp->imp_lock);
366 }
367 } while (rc != 0);
368
369 /*
370 * Let's additionally check that no new rpcs added to import in
371 * "invalidate" state.
372 */
373 LASSERT(atomic_read(&imp->imp_inflight) == 0);
374 obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
375 sptlrpc_import_flush_all_ctx(imp);
376
377 atomic_dec(&imp->imp_inval_count);
378 wake_up_all(&imp->imp_recovery_waitq);
379}
380EXPORT_SYMBOL(ptlrpc_invalidate_import);
381
382/* unset imp_invalid */
383void ptlrpc_activate_import(struct obd_import *imp)
384{
385 struct obd_device *obd = imp->imp_obd;
386
387 spin_lock(&imp->imp_lock);
388 if (imp->imp_deactive != 0) {
389 spin_unlock(&imp->imp_lock);
390 return;
391 }
392
393 imp->imp_invalid = 0;
394 spin_unlock(&imp->imp_lock);
395 obd_import_event(obd, imp, IMP_EVENT_ACTIVE);
396}
397EXPORT_SYMBOL(ptlrpc_activate_import);
398
399static void ptlrpc_pinger_force(struct obd_import *imp)
400{
401 CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd),
402 ptlrpc_import_state_name(imp->imp_state));
403
404 spin_lock(&imp->imp_lock);
405 imp->imp_force_verify = 1;
406 spin_unlock(&imp->imp_lock);
407
408 if (imp->imp_state != LUSTRE_IMP_CONNECTING)
409 ptlrpc_pinger_wake_up();
410}
411
412void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
413{
414 LASSERT(!imp->imp_dlm_fake);
415
416 if (ptlrpc_set_import_discon(imp, conn_cnt)) {
417 if (!imp->imp_replayable) {
418 CDEBUG(D_HA, "import %s@%s for %s not replayable, auto-deactivating\n",
419 obd2cli_tgt(imp->imp_obd),
420 imp->imp_connection->c_remote_uuid.uuid,
421 imp->imp_obd->obd_name);
422 ptlrpc_deactivate_import(imp);
423 }
424
425 ptlrpc_pinger_force(imp);
426 }
427}
428EXPORT_SYMBOL(ptlrpc_fail_import);
429
430int ptlrpc_reconnect_import(struct obd_import *imp)
431{
432 struct l_wait_info lwi;
433 int secs = cfs_time_seconds(obd_timeout);
434 int rc;
435
436 ptlrpc_pinger_force(imp);
437
438 CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n",
439 obd2cli_tgt(imp->imp_obd), secs);
440
441 lwi = LWI_TIMEOUT(secs, NULL, NULL);
442 rc = l_wait_event(imp->imp_recovery_waitq,
443 !ptlrpc_import_in_recovery(imp), &lwi);
444 CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd),
445 ptlrpc_import_state_name(imp->imp_state));
446 return rc;
447}
448EXPORT_SYMBOL(ptlrpc_reconnect_import);
449
450/**
451 * Connection on import \a imp is changed to another one (if more than one is
452 * present). We typically choose the connection that we have not tried to
453 * connect to for the longest time.
454 */
455static int import_select_connection(struct obd_import *imp)
456{
457 struct obd_import_conn *imp_conn = NULL, *conn;
458 struct obd_export *dlmexp;
459 char *target_start;
460 int target_len, tried_all = 1;
461
462 spin_lock(&imp->imp_lock);
463
464 if (list_empty(&imp->imp_conn_list)) {
465 CERROR("%s: no connections available\n",
466 imp->imp_obd->obd_name);
467 spin_unlock(&imp->imp_lock);
468 return -EINVAL;
469 }
470
471 list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
472 CDEBUG(D_HA, "%s: connect to NID %s last attempt %llu\n",
473 imp->imp_obd->obd_name,
474 libcfs_nid2str(conn->oic_conn->c_peer.nid),
475 conn->oic_last_attempt);
476
477 /* If we have not tried this connection since
478 * the last successful attempt, go with this one
479 */
480 if ((conn->oic_last_attempt == 0) ||
481 cfs_time_beforeq_64(conn->oic_last_attempt,
482 imp->imp_last_success_conn)) {
483 imp_conn = conn;
484 tried_all = 0;
485 break;
486 }
487
488 /* If all of the connections have already been tried
489 * since the last successful connection; just choose the
490 * least recently used
491 */
492 if (!imp_conn)
493 imp_conn = conn;
494 else if (cfs_time_before_64(conn->oic_last_attempt,
495 imp_conn->oic_last_attempt))
496 imp_conn = conn;
497 }
498
499 /* if not found, simply choose the current one */
500 if (!imp_conn || imp->imp_force_reconnect) {
501 LASSERT(imp->imp_conn_current);
502 imp_conn = imp->imp_conn_current;
503 tried_all = 0;
504 }
505 LASSERT(imp_conn->oic_conn);
506
507 /* If we've tried everything, and we're back to the beginning of the
508 * list, increase our timeout and try again. It will be reset when
509 * we do finally connect. (FIXME: really we should wait for all network
510 * state associated with the last connection attempt to drain before
511 * trying to reconnect on it.)
512 */
513 if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
514 struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
515
516 if (at_get(at) < CONNECTION_SWITCH_MAX) {
517 at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
518 if (at_get(at) > CONNECTION_SWITCH_MAX)
519 at_reset(at, CONNECTION_SWITCH_MAX);
520 }
521 LASSERT(imp_conn->oic_last_attempt);
522 CDEBUG(D_HA, "%s: tried all connections, increasing latency to %ds\n",
523 imp->imp_obd->obd_name, at_get(at));
524 }
525
526 imp_conn->oic_last_attempt = cfs_time_current_64();
527
528 /* switch connection, don't mind if it's same as the current one */
529 ptlrpc_connection_put(imp->imp_connection);
530 imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
531
532 dlmexp = class_conn2export(&imp->imp_dlm_handle);
533 ptlrpc_connection_put(dlmexp->exp_connection);
534 dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
535 class_export_put(dlmexp);
536
537 if (imp->imp_conn_current != imp_conn) {
538 if (imp->imp_conn_current) {
539 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
540 &target_start, &target_len);
541
542 CDEBUG(D_HA, "%s: Connection changing to %.*s (at %s)\n",
543 imp->imp_obd->obd_name,
544 target_len, target_start,
545 libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
546 }
547
548 imp->imp_conn_current = imp_conn;
549 }
550
551 CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
552 imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
553 libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
554
555 spin_unlock(&imp->imp_lock);
556
557 return 0;
558}
559
560/*
561 * must be called under imp_lock
562 */
563static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
564{
565 struct ptlrpc_request *req;
566 struct list_head *tmp;
567
568 /* The requests in committed_list always have smaller transnos than
569 * the requests in replay_list
570 */
571 if (!list_empty(&imp->imp_committed_list)) {
572 tmp = imp->imp_committed_list.next;
573 req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
574 *transno = req->rq_transno;
575 if (req->rq_transno == 0) {
576 DEBUG_REQ(D_ERROR, req,
577 "zero transno in committed_list");
578 LBUG();
579 }
580 return 1;
581 }
582 if (!list_empty(&imp->imp_replay_list)) {
583 tmp = imp->imp_replay_list.next;
584 req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
585 *transno = req->rq_transno;
586 if (req->rq_transno == 0) {
587 DEBUG_REQ(D_ERROR, req, "zero transno in replay_list");
588 LBUG();
589 }
590 return 1;
591 }
592 return 0;
593}
594
595/**
596 * Attempt to (re)connect import \a imp. This includes all preparations,
597 * initializing CONNECT RPC request and passing it to ptlrpcd for
598 * actual sending.
599 * Returns 0 on success or error code.
600 */
601int ptlrpc_connect_import(struct obd_import *imp)
602{
603 struct obd_device *obd = imp->imp_obd;
604 int initial_connect = 0;
605 int set_transno = 0;
606 __u64 committed_before_reconnect = 0;
607 struct ptlrpc_request *request;
608 char *bufs[] = { NULL,
609 obd2cli_tgt(imp->imp_obd),
610 obd->obd_uuid.uuid,
611 (char *)&imp->imp_dlm_handle,
612 (char *)&imp->imp_connect_data };
613 struct ptlrpc_connect_async_args *aa;
614 int rc;
615
616 spin_lock(&imp->imp_lock);
617 if (imp->imp_state == LUSTRE_IMP_CLOSED) {
618 spin_unlock(&imp->imp_lock);
619 CERROR("can't connect to a closed import\n");
620 return -EINVAL;
621 } else if (imp->imp_state == LUSTRE_IMP_FULL) {
622 spin_unlock(&imp->imp_lock);
623 CERROR("already connected\n");
624 return 0;
625 } else if (imp->imp_state == LUSTRE_IMP_CONNECTING) {
626 spin_unlock(&imp->imp_lock);
627 CERROR("already connecting\n");
628 return -EALREADY;
629 }
630
631 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);
632
633 imp->imp_conn_cnt++;
634 imp->imp_resend_replay = 0;
635
636 if (!lustre_handle_is_used(&imp->imp_remote_handle))
637 initial_connect = 1;
638 else
639 committed_before_reconnect = imp->imp_peer_committed_transno;
640
641 set_transno = ptlrpc_first_transno(imp,
642 &imp->imp_connect_data.ocd_transno);
643 spin_unlock(&imp->imp_lock);
644
645 rc = import_select_connection(imp);
646 if (rc)
647 goto out;
648
649 rc = sptlrpc_import_sec_adapt(imp, NULL, NULL);
650 if (rc)
651 goto out;
652
653 /* Reset connect flags to the originally requested flags, so that if
654 * the server is updated on-the-fly we will get the new features.
655 */
656 imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig;
657 /* Reset ocd_version each time so the server knows the exact versions */
658 imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE;
659 imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
660 imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
661
662 rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd,
663 &obd->obd_uuid, &imp->imp_connect_data, NULL);
664 if (rc)
665 goto out;
666
667 request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT);
668 if (!request) {
669 rc = -ENOMEM;
670 goto out;
671 }
672
673 rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION,
674 imp->imp_connect_op, bufs, NULL);
675 if (rc) {
676 ptlrpc_request_free(request);
677 goto out;
678 }
679
680 /* Report the rpc service time to the server so that it knows how long
681 * to wait for clients to join recovery
682 */
683 lustre_msg_set_service_time(request->rq_reqmsg,
684 at_timeout2est(request->rq_timeout));
685
686 /* The amount of time we give the server to process the connect req.
687 * import_select_connection will increase the net latency on
688 * repeated reconnect attempts to cover slow networks.
689 * We override/ignore the server rpc completion estimate here,
690 * which may be large if this is a reconnect attempt
691 */
692 request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
693 lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);
694
695 lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER);
696
697 request->rq_no_resend = 1;
698 request->rq_no_delay = 1;
699 request->rq_send_state = LUSTRE_IMP_CONNECTING;
700 /* Allow a slightly larger reply for future growth compatibility */
701 req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER,
702 sizeof(struct obd_connect_data) +
703 16 * sizeof(__u64));
704 ptlrpc_request_set_replen(request);
705 request->rq_interpret_reply = ptlrpc_connect_interpret;
706
707 CLASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
708 aa = ptlrpc_req_async_args(request);
709 memset(aa, 0, sizeof(*aa));
710
711 aa->pcaa_peer_committed = committed_before_reconnect;
712 aa->pcaa_initial_connect = initial_connect;
713
714 if (aa->pcaa_initial_connect) {
715 spin_lock(&imp->imp_lock);
716 imp->imp_replayable = 1;
717 spin_unlock(&imp->imp_lock);
718 lustre_msg_add_op_flags(request->rq_reqmsg,
719 MSG_CONNECT_INITIAL);
720 }
721
722 if (set_transno)
723 lustre_msg_add_op_flags(request->rq_reqmsg,
724 MSG_CONNECT_TRANSNO);
725
726 DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)",
727 request->rq_timeout);
728 ptlrpcd_add_req(request);
729 rc = 0;
730out:
731 if (rc != 0)
732 IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
733
734 return rc;
735}
736EXPORT_SYMBOL(ptlrpc_connect_import);
737
738static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
739{
740 int force_verify;
741
742 spin_lock(&imp->imp_lock);
743 force_verify = imp->imp_force_verify != 0;
744 spin_unlock(&imp->imp_lock);
745
746 if (force_verify)
747 ptlrpc_pinger_wake_up();
748}
749
750static int ptlrpc_busy_reconnect(int rc)
751{
752 return (rc == -EBUSY) || (rc == -EAGAIN);
753}
754
755/**
756 * interpret_reply callback for connect RPCs.
757 * Looks into returned status of connect operation and decides
758 * what to do with the import - i.e. enter recovery, promote it to
759 * full state for normal operation, or disconnect it due to an error.
760 */
761static int ptlrpc_connect_interpret(const struct lu_env *env,
762 struct ptlrpc_request *request,
763 void *data, int rc)
764{
765 struct ptlrpc_connect_async_args *aa = data;
766 struct obd_import *imp = request->rq_import;
767 struct client_obd *cli = &imp->imp_obd->u.cli;
768 struct lustre_handle old_hdl;
769 __u64 old_connect_flags;
770 int msg_flags;
771 struct obd_connect_data *ocd;
772 struct obd_export *exp;
773 int ret;
774
775 spin_lock(&imp->imp_lock);
776 if (imp->imp_state == LUSTRE_IMP_CLOSED) {
777 imp->imp_connect_tried = 1;
778 spin_unlock(&imp->imp_lock);
779 return 0;
780 }
781
782 if (rc) {
783 /* if this is a reconnect to a busy export - no need to select
784 * a new target for connecting
785 */
786 imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
787 spin_unlock(&imp->imp_lock);
788 ptlrpc_maybe_ping_import_soon(imp);
789 goto out;
790 }
791 spin_unlock(&imp->imp_lock);
792
793 LASSERT(imp->imp_conn_current);
794
795 msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
796
797 ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA,
798 RCL_SERVER);
799 /* server replied obd_connect_data is always bigger */
800 ocd = req_capsule_server_sized_get(&request->rq_pill,
801 &RMF_CONNECT_DATA, ret);
802
803 if (!ocd) {
804 CERROR("%s: no connect data from server\n",
805 imp->imp_obd->obd_name);
806 rc = -EPROTO;
807 goto out;
808 }
809
810 spin_lock(&imp->imp_lock);
811
812 /* All imports are pingable */
813 imp->imp_pingable = 1;
814 imp->imp_force_reconnect = 0;
815 imp->imp_force_verify = 0;
816
817 imp->imp_connect_data = *ocd;
818
819 CDEBUG(D_HA, "%s: connect to target with instance %u\n",
820 imp->imp_obd->obd_name, ocd->ocd_instance);
821 exp = class_conn2export(&imp->imp_dlm_handle);
822
823 spin_unlock(&imp->imp_lock);
824
825 /* check that server granted subset of flags we asked for. */
826 if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) !=
827 ocd->ocd_connect_flags) {
828 CERROR("%s: Server didn't grant requested subset of flags: asked=%#llx granted=%#llx\n",
829 imp->imp_obd->obd_name, imp->imp_connect_flags_orig,
830 ocd->ocd_connect_flags);
831 rc = -EPROTO;
832 goto out;
833 }
834
835 if (!exp) {
836 /* This could happen if export is cleaned during the
837 * connect attempt
838 */
839 CERROR("%s: missing export after connect\n",
840 imp->imp_obd->obd_name);
841 rc = -ENODEV;
842 goto out;
843 }
844 old_connect_flags = exp_connect_flags(exp);
845 exp->exp_connect_data = *ocd;
846 imp->imp_obd->obd_self_export->exp_connect_data = *ocd;
847 class_export_put(exp);
848
849 obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD);
850
851 if (aa->pcaa_initial_connect) {
852 spin_lock(&imp->imp_lock);
853 if (msg_flags & MSG_CONNECT_REPLAYABLE) {
854 imp->imp_replayable = 1;
855 spin_unlock(&imp->imp_lock);
856 CDEBUG(D_HA, "connected to replayable target: %s\n",
857 obd2cli_tgt(imp->imp_obd));
858 } else {
859 imp->imp_replayable = 0;
860 spin_unlock(&imp->imp_lock);
861 }
862
863 /* if applicable, adjust the imp->imp_msg_magic here
864 * according to reply flags
865 */
866
867 imp->imp_remote_handle =
868 *lustre_msg_get_handle(request->rq_repmsg);
869
870 /* Initial connects are allowed for clients with non-random
871 * uuids when servers are in recovery. Simply signal the
872 * servers that replay is complete and wait in REPLAY_WAIT.
873 */
874 if (msg_flags & MSG_CONNECT_RECOVERING) {
875 CDEBUG(D_HA, "connect to %s during recovery\n",
876 obd2cli_tgt(imp->imp_obd));
877 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
878 } else {
879 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
880 ptlrpc_activate_import(imp);
881 }
882
883 rc = 0;
884 goto finish;
885 }
886
887 /* Determine what recovery state to move the import to. */
888 if (msg_flags & MSG_CONNECT_RECONNECT) {
889 memset(&old_hdl, 0, sizeof(old_hdl));
890 if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg),
891 sizeof(old_hdl))) {
892 LCONSOLE_WARN("Reconnect to %s (at @%s) failed due to bad handle %#llx\n",
893 obd2cli_tgt(imp->imp_obd),
894 imp->imp_connection->c_remote_uuid.uuid,
895 imp->imp_dlm_handle.cookie);
896 rc = -ENOTCONN;
897 goto out;
898 }
899
900 if (memcmp(&imp->imp_remote_handle,
901 lustre_msg_get_handle(request->rq_repmsg),
902 sizeof(imp->imp_remote_handle))) {
903 int level = msg_flags & MSG_CONNECT_RECOVERING ?
904 D_HA : D_WARNING;
905
906 /* Bug 16611/14775: if the server handle has changed,
907 * that means some sort of disconnection happened.
908 * If the server is not in recovery, that also means it
909 * already erased all of our state because of previous
910 * eviction. If it is in recovery - we are safe to
911 * participate since we can reestablish all of our state
912 * with the server again
913 */
914 if ((msg_flags & MSG_CONNECT_RECOVERING)) {
915 CDEBUG(level, "%s@%s changed server handle from %#llx to %#llx but is still in recovery\n",
916 obd2cli_tgt(imp->imp_obd),
917 imp->imp_connection->c_remote_uuid.uuid,
918 imp->imp_remote_handle.cookie,
919 lustre_msg_get_handle(
920 request->rq_repmsg)->cookie);
921 } else {
922 LCONSOLE_WARN("Evicted from %s (at %s) after server handle changed from %#llx to %#llx\n",
923 obd2cli_tgt(imp->imp_obd),
924 imp->imp_connection-> \
925 c_remote_uuid.uuid,
926 imp->imp_remote_handle.cookie,
927 lustre_msg_get_handle(
928 request->rq_repmsg)->cookie);
929 }
930
931 imp->imp_remote_handle =
932 *lustre_msg_get_handle(request->rq_repmsg);
933
934 if (!(msg_flags & MSG_CONNECT_RECOVERING)) {
935 IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
936 rc = 0;
937 goto finish;
938 }
939
940 } else {
941 CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
942 obd2cli_tgt(imp->imp_obd),
943 imp->imp_connection->c_remote_uuid.uuid);
944 }
945
946 if (imp->imp_invalid) {
947 CDEBUG(D_HA, "%s: reconnected but import is invalid; marking evicted\n",
948 imp->imp_obd->obd_name);
949 IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
950 } else if (msg_flags & MSG_CONNECT_RECOVERING) {
951 CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
952 imp->imp_obd->obd_name,
953 obd2cli_tgt(imp->imp_obd));
954
955 spin_lock(&imp->imp_lock);
956 imp->imp_resend_replay = 1;
957 spin_unlock(&imp->imp_lock);
958
959 IMPORT_SET_STATE(imp, imp->imp_replay_state);
960 } else {
961 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
962 }
963 } else if ((msg_flags & MSG_CONNECT_RECOVERING) && !imp->imp_invalid) {
964 LASSERT(imp->imp_replayable);
965 imp->imp_remote_handle =
966 *lustre_msg_get_handle(request->rq_repmsg);
967 imp->imp_last_replay_transno = 0;
968 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
969 } else {
970 DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags not set: %x)",
971 imp->imp_obd->obd_name, msg_flags);
972 imp->imp_remote_handle =
973 *lustre_msg_get_handle(request->rq_repmsg);
974 IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
975 }
976
977 /* Sanity checks for a reconnected import. */
978 if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) {
979 CERROR("imp_replayable flag does not match server after reconnect. We should LBUG right here.\n");
980 }
981
982 if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 &&
983 lustre_msg_get_last_committed(request->rq_repmsg) <
984 aa->pcaa_peer_committed) {
985 CERROR("%s went back in time (transno %lld was previously committed, server now claims %lld)! See https://bugzilla.lustre.org/show_bug.cgi?id=9646\n",
986 obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
987 lustre_msg_get_last_committed(request->rq_repmsg));
988 }
989
990finish:
991 rc = ptlrpc_import_recovery_state_machine(imp);
992 if (rc != 0) {
993 if (rc == -ENOTCONN) {
994 CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n",
995 obd2cli_tgt(imp->imp_obd),
996 imp->imp_connection->c_remote_uuid.uuid);
997 ptlrpc_connect_import(imp);
998 imp->imp_connect_tried = 1;
999 return 0;
1000 }
1001 } else {
1002 static bool warned;
1003
1004 spin_lock(&imp->imp_lock);
1005 list_del(&imp->imp_conn_current->oic_item);
1006 list_add(&imp->imp_conn_current->oic_item, &imp->imp_conn_list);
1007 imp->imp_last_success_conn =
1008 imp->imp_conn_current->oic_last_attempt;
1009
1010 spin_unlock(&imp->imp_lock);
1011
1012 if ((imp->imp_connect_flags_orig & OBD_CONNECT_IBITS) &&
1013 !(ocd->ocd_connect_flags & OBD_CONNECT_IBITS)) {
1014 LCONSOLE_WARN("%s: MDS %s does not support ibits lock, either very old or invalid: requested %llx, replied %llx\n",
1015 imp->imp_obd->obd_name,
1016 imp->imp_connection->c_remote_uuid.uuid,
1017 imp->imp_connect_flags_orig,
1018 ocd->ocd_connect_flags);
1019 rc = -EPROTO;
1020 goto out;
1021 }
1022
1023 if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
1024 (ocd->ocd_version > LUSTRE_VERSION_CODE +
1025 LUSTRE_VERSION_OFFSET_WARN ||
1026 ocd->ocd_version < LUSTRE_VERSION_CODE -
1027 LUSTRE_VERSION_OFFSET_WARN)) {
1028 /* Sigh, some compilers do not like #ifdef in the middle
1029 * of macro arguments
1030 */
1031 const char *older = "older than client. Consider upgrading server";
1032 const char *newer = "newer than client. Consider recompiling application";
1033
1034 LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) is much %s (%s)\n",
1035 obd2cli_tgt(imp->imp_obd),
1036 OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
1037 OBD_OCD_VERSION_MINOR(ocd->ocd_version),
1038 OBD_OCD_VERSION_PATCH(ocd->ocd_version),
1039 OBD_OCD_VERSION_FIX(ocd->ocd_version),
1040 ocd->ocd_version > LUSTRE_VERSION_CODE ?
1041 newer : older, LUSTRE_VERSION_STRING);
1042 warned = true;
1043 }
1044
1045#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
1046 /* Check if server has LU-1252 fix applied to not always swab
1047 * the IR MNE entries. Do this only once per connection. This
1048 * fixup is version-limited, because we don't want to carry the
1049 * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we
1050 * need interop with unpatched 2.2 servers. For newer servers,
dadfcdab
OD
1051 * the client will do MNE swabbing only as needed. LU-1644
1052 */
1053 if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
1054 !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) &&
1055 OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 &&
1056 OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 &&
1057 OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 &&
1058 strcmp(imp->imp_obd->obd_type->typ_name,
1059 LUSTRE_MGC_NAME) == 0))
1060 imp->imp_need_mne_swab = 1;
1061 else /* clear if server was upgraded since last connect */
1062 imp->imp_need_mne_swab = 0;
1063#else
1064#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
1065#endif
1066
1067 if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) {
1068 /* We sent to the server ocd_cksum_types with bits set
1069 * for algorithms we understand. The server masked off
dadfcdab
OD
1070 * the checksum types it doesn't support
1071 */
1072 if ((ocd->ocd_cksum_types &
1073 cksum_types_supported_client()) == 0) {
1074 LCONSOLE_WARN("The negotiation of the checksum algorithm to use with server %s failed (%x/%x), disabling checksums\n",
1075 obd2cli_tgt(imp->imp_obd),
1076 ocd->ocd_cksum_types,
1077 cksum_types_supported_client());
1078 cli->cl_checksum = 0;
1079 cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
1080 } else {
1081 cli->cl_supp_cksum_types = ocd->ocd_cksum_types;
1082 }
1083 } else {
1084 /* The server does not support OBD_CONNECT_CKSUM.
1085 * Enforce ADLER for backward compatibility
1086 */
1087 cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
1088 }
1089 cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types);
1090
1091 if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
1092 cli->cl_max_pages_per_rpc =
1093 min(ocd->ocd_brw_size >> PAGE_SHIFT,
1094 cli->cl_max_pages_per_rpc);
1095 else if (imp->imp_connect_op == MDS_CONNECT ||
1096 imp->imp_connect_op == MGS_CONNECT)
1097 cli->cl_max_pages_per_rpc = 1;
1098
1099 /* Reset ns_connect_flags only for initial connect. It might be
1100 * changed while using the FS, and if we reset it on reconnect
1101 * this leads to losing user settings done before, such as
1102 * disable lru_resize, etc.
1103 */
1104 if (old_connect_flags != exp_connect_flags(exp) ||
1105 aa->pcaa_initial_connect) {
1106 CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server flags: %#llx\n",
1107 imp->imp_obd->obd_name, ocd->ocd_connect_flags);
1108 imp->imp_obd->obd_namespace->ns_connect_flags =
1109 ocd->ocd_connect_flags;
1110 imp->imp_obd->obd_namespace->ns_orig_connect_flags =
1111 ocd->ocd_connect_flags;
1112 }
1113
1114 if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
1115 (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
1116 /* We need a per-message support flag, because
1117 * a. we don't know if the incoming connect reply
1118 * supports AT or not (in reply_in_callback)
1119 * until we unpack it.
1120 * b. a failed-over server means export and flags are gone
1121 * (in ptlrpc_send_reply).
1122 * Can only be set when we know AT is supported at
1123 * both ends
1124 */
1125 imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
1126 else
1127 imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
1128
1129 if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) &&
1130 (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
1131 imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
1132 else
1133 imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
1134
1135 LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
1136 (cli->cl_max_pages_per_rpc > 0));
1137 client_adjust_max_dirty(cli);
1138 }
1139
1140out:
1141 imp->imp_connect_tried = 1;
1142
1143 if (rc != 0) {
1144 IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
1145 if (rc == -EACCES) {
1146 /*
1147 * Give up trying to reconnect
1148 * EACCES means client has no permission for connection
1149 */
1150 imp->imp_obd->obd_no_recov = 1;
1151 ptlrpc_deactivate_import(imp);
1152 }
1153
1154 if (rc == -EPROTO) {
1155 struct obd_connect_data *ocd;
1156
1157 /* reply message might not be ready */
1158 if (!request->rq_repmsg)
1159 return -EPROTO;
1160
1161 ocd = req_capsule_server_get(&request->rq_pill,
1162 &RMF_CONNECT_DATA);
1163 if (ocd &&
1164 (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
1165 (ocd->ocd_version != LUSTRE_VERSION_CODE)) {
1166 /*
1167 * Actually servers are only supposed to refuse
1168 * connection from liblustre clients, so we
1169 * should never see this from VFS context
1170 */
1171 LCONSOLE_ERROR_MSG(0x16a, "Server %s version (%d.%d.%d.%d) refused connection from this client with an incompatible version (%s). Client must be recompiled\n",
1172 obd2cli_tgt(imp->imp_obd),
1173 OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
1174 OBD_OCD_VERSION_MINOR(ocd->ocd_version),
1175 OBD_OCD_VERSION_PATCH(ocd->ocd_version),
1176 OBD_OCD_VERSION_FIX(ocd->ocd_version),
1177 LUSTRE_VERSION_STRING);
1178 ptlrpc_deactivate_import(imp);
1179 IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED);
1180 }
1181 return -EPROTO;
1182 }
1183
1184 ptlrpc_maybe_ping_import_soon(imp);
1185
1186 CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
1187 obd2cli_tgt(imp->imp_obd),
1188 (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
1189 }
1190
1191 wake_up_all(&imp->imp_recovery_waitq);
1192 return rc;
1193}
1194
1195/**
1196 * interpret callback for "completed replay" RPCs.
1197 * \see signal_completed_replay
1198 */
1199static int completed_replay_interpret(const struct lu_env *env,
1200 struct ptlrpc_request *req,
1201 void *data, int rc)
1202{
1203 atomic_dec(&req->rq_import->imp_replay_inflight);
1204 if (req->rq_status == 0 &&
1205 !req->rq_import->imp_vbr_failed) {
1206 ptlrpc_import_recovery_state_machine(req->rq_import);
1207 } else {
1208 if (req->rq_import->imp_vbr_failed) {
1209 CDEBUG(D_WARNING,
1210 "%s: version recovery fails, reconnecting\n",
1211 req->rq_import->imp_obd->obd_name);
1212 } else {
1213 CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, reconnecting\n",
1214 req->rq_import->imp_obd->obd_name,
1215 req->rq_status);
1216 }
1217 ptlrpc_connect_import(req->rq_import);
1218 }
1219
1220 return 0;
1221}
1222
1223/**
1224 * Let server know that we have no requests to replay anymore.
1225 * Achieved by just sending a PING request
1226 */
1227static int signal_completed_replay(struct obd_import *imp)
1228{
1229 struct ptlrpc_request *req;
1230
1231 if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY)))
1232 return 0;
1233
1234 LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
1235 atomic_inc(&imp->imp_replay_inflight);
1236
1237 req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION,
1238 OBD_PING);
1239 if (!req) {
1240 atomic_dec(&imp->imp_replay_inflight);
1241 return -ENOMEM;
1242 }
1243
1244 ptlrpc_request_set_replen(req);
1245 req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
1246 lustre_msg_add_flags(req->rq_reqmsg,
1247 MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE);
1248 if (AT_OFF)
1249 req->rq_timeout *= 3;
1250 req->rq_interpret_reply = completed_replay_interpret;
1251
1252 ptlrpcd_add_req(req);
1253 return 0;
1254}
1255
1256/**
1257 * In kernel code all import invalidation happens in its own
1258 * separate thread, so that whatever application happened to encounter
1259 * a problem could still be killed or otherwise continue
1260 */
1261static int ptlrpc_invalidate_import_thread(void *data)
1262{
1263 struct obd_import *imp = data;
1264
1265 unshare_fs_struct();
1266
1267 CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
1268 imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
1269 imp->imp_connection->c_remote_uuid.uuid);
1270
1271 ptlrpc_invalidate_import(imp);
1272
1273 if (obd_dump_on_eviction) {
1274 CERROR("dump the log upon eviction\n");
1275 libcfs_debug_dumplog();
1276 }
1277
1278 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
1279 ptlrpc_import_recovery_state_machine(imp);
1280
1281 class_import_put(imp);
1282 return 0;
1283}
1284
1285/**
1286 * This is the state machine for client-side recovery on import.
1287 *
1288 * Typically we have two possible paths. If we came to a server and it is not
1289 * in recovery, we just enter IMP_EVICTED state, invalidate our import
1290 * state and reconnect from scratch.
1291 * If we came to a server that is in recovery, we enter IMP_REPLAY import state.
1292 * We go through our list of requests to replay and send them to the server one by
1293 * one.
1294 * After sending all request from the list we change import state to
1295 * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from server
1296 * and also all the locks we don't yet have and wait for server to grant us.
1297 * After that we send a special "replay completed" request and change import
1298 * state to IMP_REPLAY_WAIT.
1299 * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER
1300 * state and resend all requests from sending list.
1301 * After that we promote import to FULL state and send all delayed requests
1302 * and import is fully operational after that.
1303 *
1304 */
1305int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
1306{
1307 int rc = 0;
1308 int inflight;
1309 char *target_start;
1310 int target_len;
1311
1312 if (imp->imp_state == LUSTRE_IMP_EVICTED) {
1313 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
1314 &target_start, &target_len);
1315 /* Don't care about MGC eviction */
1316 if (strcmp(imp->imp_obd->obd_type->typ_name,
1317 LUSTRE_MGC_NAME) != 0) {
1318 LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted by %.*s; in progress operations using this service will fail.\n",
1319 imp->imp_obd->obd_name, target_len,
1320 target_start);
1321 }
1322 CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
1323 obd2cli_tgt(imp->imp_obd),
1324 imp->imp_connection->c_remote_uuid.uuid);
1325 /* reset vbr_failed flag upon eviction */
1326 spin_lock(&imp->imp_lock);
1327 imp->imp_vbr_failed = 0;
1328 spin_unlock(&imp->imp_lock);
1329
1330 {
1331 struct task_struct *task;
1332 /* bug 17802: XXX client_disconnect_export vs connect request
1333 * race. If the client is evicted at this time, we start the
1334 * invalidate thread without a reference to the import, and the
1335 * import can be freed at the same time.
1336 */
1337 class_import_get(imp);
1338 task = kthread_run(ptlrpc_invalidate_import_thread, imp,
1339 "ll_imp_inval");
1340 if (IS_ERR(task)) {
1341 class_import_put(imp);
1342 CERROR("error starting invalidate thread: %d\n", rc);
1343 rc = PTR_ERR(task);
1344 } else {
1345 rc = 0;
1346 }
1347 return rc;
1348 }
1349 }
1350
1351 if (imp->imp_state == LUSTRE_IMP_REPLAY) {
1352 CDEBUG(D_HA, "replay requested by %s\n",
1353 obd2cli_tgt(imp->imp_obd));
1354 rc = ptlrpc_replay_next(imp, &inflight);
1355 if (inflight == 0 &&
1356 atomic_read(&imp->imp_replay_inflight) == 0) {
1357 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
1358 rc = ldlm_replay_locks(imp);
1359 if (rc)
1360 goto out;
1361 }
1362 rc = 0;
1363 }
1364
1365 if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) {
1366 if (atomic_read(&imp->imp_replay_inflight) == 0) {
1367 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT);
1368 rc = signal_completed_replay(imp);
1369 if (rc)
1370 goto out;
1371 }
1372 }
1373
1374 if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) {
1375 if (atomic_read(&imp->imp_replay_inflight) == 0) {
1376 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
1377 }
1378 }
1379
1380 if (imp->imp_state == LUSTRE_IMP_RECOVER) {
1381 CDEBUG(D_HA, "reconnected to %s@%s\n",
1382 obd2cli_tgt(imp->imp_obd),
1383 imp->imp_connection->c_remote_uuid.uuid);
1384
1385 rc = ptlrpc_resend(imp);
1386 if (rc)
1387 goto out;
1388 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
1389 ptlrpc_activate_import(imp);
1390
1391 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
1392 &target_start, &target_len);
1393 LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n",
1394 imp->imp_obd->obd_name,
1395 target_len, target_start,
1396 libcfs_nid2str(imp->imp_connection->c_peer.nid));
1397 }
1398
1399 if (imp->imp_state == LUSTRE_IMP_FULL) {
1400 wake_up_all(&imp->imp_recovery_waitq);
1401 ptlrpc_wake_delayed(imp);
1402 }
1403
1404out:
1405 return rc;
1406}
1407
1408int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
1409{
1410 struct ptlrpc_request *req;
1411 int rq_opc, rc = 0;
1412
1413 if (imp->imp_obd->obd_force)
1414 goto set_state;
1415
1416 switch (imp->imp_connect_op) {
1417 case OST_CONNECT:
1418 rq_opc = OST_DISCONNECT;
1419 break;
1420 case MDS_CONNECT:
1421 rq_opc = MDS_DISCONNECT;
1422 break;
1423 case MGS_CONNECT:
1424 rq_opc = MGS_DISCONNECT;
1425 break;
1426 default:
1427 rc = -EINVAL;
1428 CERROR("%s: don't know how to disconnect from %s (connect_op %d): rc = %d\n",
1429 imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
1430 imp->imp_connect_op, rc);
1431 return rc;
1432 }
1433
1434 if (ptlrpc_import_in_recovery(imp)) {
1435 struct l_wait_info lwi;
1436 long timeout;
1437
1438 if (AT_OFF) {
1439 if (imp->imp_server_timeout)
1440 timeout = cfs_time_seconds(obd_timeout / 2);
1441 else
1442 timeout = cfs_time_seconds(obd_timeout);
1443 } else {
1444 int idx = import_at_get_index(imp,
1445 imp->imp_client->cli_request_portal);
1446 timeout = cfs_time_seconds(
1447 at_get(&imp->imp_at.iat_service_estimate[idx]));
1448 }
1449
1450 lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
1451 back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
1452 rc = l_wait_event(imp->imp_recovery_waitq,
1453 !ptlrpc_import_in_recovery(imp), &lwi);
1454 }
1455
1456 spin_lock(&imp->imp_lock);
1457 if (imp->imp_state != LUSTRE_IMP_FULL)
1458 goto out;
1459 spin_unlock(&imp->imp_lock);
1460
1461 req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT,
1462 LUSTRE_OBD_VERSION, rq_opc);
1463 if (req) {
1464 /* We are disconnecting, do not retry the DISCONNECT rpc if
1465 * it fails. We can get through the above with a down server
1466 * if the client doesn't know the server is gone yet.
1467 */
1468 req->rq_no_resend = 1;
1469
1470 /* We want client umounts to happen quickly, no matter the
1471 * server state...
1472 */
1473 req->rq_timeout = min_t(int, req->rq_timeout,
1474 INITIAL_CONNECT_TIMEOUT);
1475
1476 IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
1477 req->rq_send_state = LUSTRE_IMP_CONNECTING;
1478 ptlrpc_request_set_replen(req);
1479 rc = ptlrpc_queue_wait(req);
1480 ptlrpc_req_finished(req);
1481 }
1482
1483set_state:
1484 spin_lock(&imp->imp_lock);
1485out:
1486 if (noclose)
1487 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
1488 else
1489 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
1490 memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
1491 spin_unlock(&imp->imp_lock);
1492
1493 if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN)
1494 rc = 0;
1495
1496 return rc;
1497}
1498EXPORT_SYMBOL(ptlrpc_disconnect_import);
1499
1500/* Adaptive Timeout utils */
1501extern unsigned int at_min, at_max, at_history;
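/*
 * at_min/at_max clamp the adaptive timeout value maintained below, and
 * at_history is the length, in seconds, of the sliding window covered by the
 * AT_BINS history bins.
 */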
1502
1503/*
1504 * Update at_current with the specified value (bounded by at_min and at_max),
1505 * as well as the AT history "bins".
1506 * - Bin into timeslices using AT_BINS bins.
1507 * - This gives us a max of the last at_history seconds without the storage,
1508 * but still smoothing out a return to normalcy from a slow response.
1509 * - (E.g. remember the maximum latency in each minute of the last 4 minutes.)
1510 */
1511int at_measured(struct adaptive_timeout *at, unsigned int val)
1512{
1513 unsigned int old = at->at_current;
1514 time64_t now = ktime_get_real_seconds();
1515 long binlimit = max_t(long, at_history / AT_BINS, 1);
1516
1517 LASSERT(at);
1518 CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
1519 val, at, (long)(now - at->at_binstart), at->at_current,
1520 at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
1521
1522 if (val == 0)
1523 /* 0's don't count, because we never want our timeout to
1524 * drop to 0, and because 0 could mean an error
1525 */
1526 return 0;
1527
1528 spin_lock(&at->at_lock);
1529
1530 if (unlikely(at->at_binstart == 0)) {
1531 /* Special case to remove default from history */
1532 at->at_current = val;
1533 at->at_worst_ever = val;
1534 at->at_worst_time = now;
1535 at->at_hist[0] = val;
1536 at->at_binstart = now;
1537 } else if (now - at->at_binstart < binlimit) {
1538 /* in bin 0 */
1539 at->at_hist[0] = max(val, at->at_hist[0]);
1540 at->at_current = max(val, at->at_current);
1541 } else {
1542 int i, shift;
1543 unsigned int maxv = val;
1544 /* move bins over */
1545 shift = (u32)(now - at->at_binstart) / binlimit;
1546 LASSERT(shift > 0);
1547 for (i = AT_BINS - 1; i >= 0; i--) {
1548 if (i >= shift) {
1549 at->at_hist[i] = at->at_hist[i - shift];
1550 maxv = max(maxv, at->at_hist[i]);
1551 } else {
1552 at->at_hist[i] = 0;
1553 }
1554 }
1555 at->at_hist[0] = val;
1556 at->at_current = maxv;
1557 at->at_binstart += shift * binlimit;
1558 }
1559
1560 if (at->at_current > at->at_worst_ever) {
1561 at->at_worst_ever = at->at_current;
1562 at->at_worst_time = now;
1563 }
1564
1565 if (at->at_flags & AT_FLG_NOHIST)
1566 /* Only keep last reported val; keeping the rest of the history
1567 * for debugfs only
1568 */
1569 at->at_current = val;
1570
1571 if (at_max > 0)
1572 at->at_current = min(at->at_current, at_max);
1573 at->at_current = max(at->at_current, at_min);
1574
1575 if (at->at_current != old)
1576 CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d (val=%u) hist %u %u %u %u\n",
1577 at,
1578 old, at->at_current, at->at_current - old, val,
1579 at->at_hist[0], at->at_hist[1], at->at_hist[2],
1580 at->at_hist[3]);
1581
1582 /* if we changed, report the old value */
1583 old = (at->at_current != old) ? old : 0;
1584
1585 spin_unlock(&at->at_lock);
1586 return old;
1587}
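/*
 * Worked example (assuming at_history = 600 and AT_BINS = 4): binlimit is
 * 150, so each bin covers a 150-second slice and at_current tracks the
 * maximum value reported over roughly the last 600 seconds; once a burst of
 * slow replies ages out of the bins, the timeout can decay again.
 */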
1588
1589/* Find the imp_at index for a given portal; assign if space available */
1590int import_at_get_index(struct obd_import *imp, int portal)
1591{
1592 struct imp_at *at = &imp->imp_at;
1593 int i;
1594
1595 for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
1596 if (at->iat_portal[i] == portal)
1597 return i;
1598 if (at->iat_portal[i] == 0)
1599 /* unused */
1600 break;
1601 }
1602
1603 /* Not found in list, add it under a lock */
1604 spin_lock(&imp->imp_lock);
1605
1606 /* Check unused under lock */
1607 for (; i < IMP_AT_MAX_PORTALS; i++) {
1608 if (at->iat_portal[i] == portal)
1609 goto out;
1610 if (at->iat_portal[i] == 0)
1611 /* unused */
1612 break;
1613 }
1614
1615 /* Not enough portals? */
1616 LASSERT(i < IMP_AT_MAX_PORTALS);
1617
1618 at->iat_portal[i] = portal;
1619out:
1620 spin_unlock(&imp->imp_lock);
1621 return i;
1622}