Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
[deliverable/linux.git] / drivers / staging / lustre / lustre / ptlrpc / import.c
CommitLineData
d7e09d03
PT
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
1dc563a6 30 * Copyright (c) 2011, 2015, Intel Corporation.
d7e09d03
PT
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/ptlrpc/import.c
37 *
38 * Author: Mike Shaver <shaver@clusterfs.com>
39 */
40
41#define DEBUG_SUBSYSTEM S_RPC
42
e27db149
GKH
43#include "../include/obd_support.h"
44#include "../include/lustre_ha.h"
45#include "../include/lustre_net.h"
46#include "../include/lustre_import.h"
47#include "../include/lustre_export.h"
48#include "../include/obd.h"
49#include "../include/obd_cksum.h"
50#include "../include/obd_class.h"
d7e09d03
PT
51
52#include "ptlrpc_internal.h"
53
54struct ptlrpc_connect_async_args {
55 __u64 pcaa_peer_committed;
56 int pcaa_initial_connect;
57};
58
59/**
60 * Updates import \a imp current state to provided \a state value
61 * Helper function. Must be called under imp_lock.
62 */
63static void __import_set_state(struct obd_import *imp,
64 enum lustre_imp_state state)
65{
502cb58e
AS
66 switch (state) {
67 case LUSTRE_IMP_CLOSED:
68 case LUSTRE_IMP_NEW:
69 case LUSTRE_IMP_DISCON:
70 case LUSTRE_IMP_CONNECTING:
71 break;
72 case LUSTRE_IMP_REPLAY_WAIT:
73 imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS;
74 break;
75 default:
76 imp->imp_replay_state = LUSTRE_IMP_REPLAY;
77 }
78
d7e09d03
PT
79 imp->imp_state = state;
80 imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state;
81 imp->imp_state_hist[imp->imp_state_hist_idx].ish_time =
74e489aa 82 ktime_get_real_seconds();
d7e09d03
PT
83 imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) %
84 IMP_STATE_HIST_LEN;
85}
86
/* A CLOSED import should remain so: both helpers refuse to transition an
 * import out of LUSTRE_IMP_CLOSED. The NOLOCK flavour requires imp_lock
 * to already be held by the caller.
 */
#define IMPORT_SET_STATE_NOLOCK(imp, state)				      \
do {									      \
	if (imp->imp_state != LUSTRE_IMP_CLOSED) {			      \
		CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \
		       imp, obd2cli_tgt(imp->imp_obd),			      \
		       ptlrpc_import_state_name(imp->imp_state),	      \
		       ptlrpc_import_state_name(state));		      \
		__import_set_state(imp, state);				      \
	}								      \
} while (0)

/* Locked wrapper: performs the state transition under imp_lock. */
#define IMPORT_SET_STATE(imp, state)					      \
do {									      \
	spin_lock(&imp->imp_lock);					      \
	IMPORT_SET_STATE_NOLOCK(imp, state);				      \
	spin_unlock(&imp->imp_lock);					      \
} while (0)
d7e09d03 105
d7e09d03
PT
106static int ptlrpc_connect_interpret(const struct lu_env *env,
107 struct ptlrpc_request *request,
aff9d8e8 108 void *data, int rc);
d7e09d03
PT
109int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
110
111/* Only this function is allowed to change the import state when it is
112 * CLOSED. I would rather refcount the import and free it after
113 * disconnection like we do with exports. To do that, the client_obd
114 * will need to save the peer info somewhere other than in the import,
dadfcdab
OD
115 * though.
116 */
d7e09d03
PT
117int ptlrpc_init_import(struct obd_import *imp)
118{
119 spin_lock(&imp->imp_lock);
120
121 imp->imp_generation++;
d0bfef31 122 imp->imp_state = LUSTRE_IMP_NEW;
d7e09d03
PT
123
124 spin_unlock(&imp->imp_lock);
125
126 return 0;
127}
128EXPORT_SYMBOL(ptlrpc_init_import);
129
#define UUID_STR "_UUID"

/*
 * Strip an optional \a prefix and a trailing "_UUID" suffix from \a uuid.
 * On return *uuid_start points at the first character after the prefix
 * (or at \a uuid itself when no prefix matches) and *uuid_len is the
 * length of that name without the "_UUID" suffix. The string is not
 * modified.
 */
static void deuuidify(char *uuid, const char *prefix, char **uuid_start,
		      int *uuid_len)
{
	if (prefix && !strncmp(uuid, prefix, strlen(prefix)))
		*uuid_start = uuid + strlen(prefix);
	else
		*uuid_start = uuid;

	*uuid_len = strlen(*uuid_start);

	/* Too short to even contain the "_UUID" suffix? */
	if (*uuid_len < strlen(UUID_STR))
		return;

	if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR),
		     UUID_STR, strlen(UUID_STR)))
		*uuid_len -= strlen(UUID_STR);
}
d7e09d03
PT
146
147/**
148 * Returns true if import was FULL, false if import was already not
149 * connected.
150 * @imp - import to be disconnected
151 * @conn_cnt - connection count (epoch) of the request that timed out
152 * and caused the disconnection. In some cases, multiple
153 * inflight requests can fail to a single target (e.g. OST
154 * bulk requests) and if one has already caused a reconnection
155 * (increasing the import->conn_cnt) the older failure should
156 * not also cause a reconnection. If zero it forces a reconnect.
157 */
158int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt)
159{
160 int rc = 0;
161
162 spin_lock(&imp->imp_lock);
163
164 if (imp->imp_state == LUSTRE_IMP_FULL &&
165 (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) {
166 char *target_start;
167 int target_len;
168
169 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
170 &target_start, &target_len);
171
172 if (imp->imp_replayable) {
2d00bd17
JP
173 LCONSOLE_WARN("%s: Connection to %.*s (at %s) was lost; in progress operations using this service will wait for recovery to complete\n",
174 imp->imp_obd->obd_name, target_len, target_start,
175 libcfs_nid2str(imp->imp_connection->c_peer.nid));
d7e09d03 176 } else {
2d00bd17
JP
177 LCONSOLE_ERROR_MSG(0x166, "%s: Connection to %.*s (at %s) was lost; in progress operations using this service will fail\n",
178 imp->imp_obd->obd_name,
179 target_len, target_start,
180 libcfs_nid2str(imp->imp_connection->c_peer.nid));
d7e09d03 181 }
d7e09d03
PT
182 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
183 spin_unlock(&imp->imp_lock);
184
185 if (obd_dump_on_timeout)
186 libcfs_debug_dumplog();
187
188 obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
189 rc = 1;
190 } else {
191 spin_unlock(&imp->imp_lock);
192 CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n",
193 imp->imp_client->cli_name, imp,
194 (imp->imp_state == LUSTRE_IMP_FULL &&
195 imp->imp_conn_cnt > conn_cnt) ?
196 "reconnected" : "not connected", imp->imp_conn_cnt,
197 conn_cnt, ptlrpc_import_state_name(imp->imp_state));
198 }
199
200 return rc;
201}
202
441fda84
ML
203/*
204 * This acts as a barrier; all existing requests are rejected, and
205 * no new requests will be accepted until the import is valid again.
206 */
207void ptlrpc_deactivate_import(struct obd_import *imp)
d7e09d03 208{
d7e09d03 209 CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd));
441fda84
ML
210
211 spin_lock(&imp->imp_lock);
d7e09d03
PT
212 imp->imp_invalid = 1;
213 imp->imp_generation++;
214 spin_unlock(&imp->imp_lock);
215
216 ptlrpc_abort_inflight(imp);
217 obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE);
d7e09d03 218}
d7e09d03
PT
219EXPORT_SYMBOL(ptlrpc_deactivate_import);
220
221static unsigned int
219e6de6 222ptlrpc_inflight_deadline(struct ptlrpc_request *req, time64_t now)
d7e09d03
PT
223{
224 long dl;
225
226 if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
227 (req->rq_phase == RQ_PHASE_BULK) ||
228 (req->rq_phase == RQ_PHASE_NEW)))
229 return 0;
230
231 if (req->rq_timedout)
232 return 0;
233
234 if (req->rq_phase == RQ_PHASE_NEW)
235 dl = req->rq_sent;
236 else
237 dl = req->rq_deadline;
238
239 if (dl <= now)
240 return 0;
241
242 return dl - now;
243}
244
245static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp)
246{
219e6de6 247 time64_t now = ktime_get_real_seconds();
d7e09d03
PT
248 struct list_head *tmp, *n;
249 struct ptlrpc_request *req;
250 unsigned int timeout = 0;
251
252 spin_lock(&imp->imp_lock);
253 list_for_each_safe(tmp, n, &imp->imp_sending_list) {
254 req = list_entry(tmp, struct ptlrpc_request, rq_list);
255 timeout = max(ptlrpc_inflight_deadline(req, now), timeout);
256 }
257 spin_unlock(&imp->imp_lock);
258 return timeout;
259}
260
261/**
262 * This function will invalidate the import, if necessary, then block
263 * for all the RPC completions, and finally notify the obd to
264 * invalidate its state (ie cancel locks, clear pending requests,
265 * etc).
266 */
267void ptlrpc_invalidate_import(struct obd_import *imp)
268{
269 struct list_head *tmp, *n;
270 struct ptlrpc_request *req;
271 struct l_wait_info lwi;
272 unsigned int timeout;
273 int rc;
274
275 atomic_inc(&imp->imp_inval_count);
276
277 if (!imp->imp_invalid || imp->imp_obd->obd_no_recov)
278 ptlrpc_deactivate_import(imp);
279
cca8fca1 280 CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2);
d7e09d03
PT
281 LASSERT(imp->imp_invalid);
282
283 /* Wait forever until inflight == 0. We really can't do it another
284 * way because in some cases we need to wait for very long reply
285 * unlink. We can't do anything before that because there is really
dadfcdab
OD
286 * no guarantee that some rdma transfer is not in progress right now.
287 */
d7e09d03
PT
288 do {
289 /* Calculate max timeout for waiting on rpcs to error
290 * out. Use obd_timeout if calculated value is smaller
dadfcdab
OD
291 * than it.
292 */
d7e09d03
PT
293 if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
294 timeout = ptlrpc_inflight_timeout(imp);
295 timeout += timeout / 3;
296
297 if (timeout == 0)
298 timeout = obd_timeout;
299 } else {
300 /* decrease the interval to increase race condition */
301 timeout = 1;
302 }
303
1d8cb70c
GD
304 CDEBUG(D_RPCTRACE,
305 "Sleeping %d sec for inflight to error out\n",
d7e09d03
PT
306 timeout);
307
308 /* Wait for all requests to error out and call completion
309 * callbacks. Cap it at obd_timeout -- these should all
dadfcdab
OD
310 * have been locally cancelled by ptlrpc_abort_inflight.
311 */
d7e09d03
PT
312 lwi = LWI_TIMEOUT_INTERVAL(
313 cfs_timeout_cap(cfs_time_seconds(timeout)),
314 (timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2,
315 NULL, NULL);
316 rc = l_wait_event(imp->imp_recovery_waitq,
317 (atomic_read(&imp->imp_inflight) == 0),
318 &lwi);
319 if (rc) {
320 const char *cli_tgt = obd2cli_tgt(imp->imp_obd);
321
322 CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
323 cli_tgt, rc,
324 atomic_read(&imp->imp_inflight));
325
326 spin_lock(&imp->imp_lock);
327 if (atomic_read(&imp->imp_inflight) == 0) {
328 int count = atomic_read(&imp->imp_unregistering);
329
330 /* We know that "unregistering" rpcs only can
331 * survive in sending or delaying lists (they
332 * maybe waiting for long reply unlink in
333 * sluggish nets). Let's check this. If there
334 * is no inflight and unregistering != 0, this
dadfcdab
OD
335 * is bug.
336 */
2d00bd17
JP
337 LASSERTF(count == 0, "Some RPCs are still unregistering: %d\n",
338 count);
d7e09d03
PT
339
340 /* Let's save one loop as soon as inflight have
341 * dropped to zero. No new inflights possible at
dadfcdab
OD
342 * this point.
343 */
d7e09d03
PT
344 rc = 0;
345 } else {
346 list_for_each_safe(tmp, n,
30c0aa39 347 &imp->imp_sending_list) {
d7e09d03 348 req = list_entry(tmp,
30c0aa39
OD
349 struct ptlrpc_request,
350 rq_list);
d7e09d03
PT
351 DEBUG_REQ(D_ERROR, req,
352 "still on sending list");
353 }
354 list_for_each_safe(tmp, n,
30c0aa39 355 &imp->imp_delayed_list) {
d7e09d03 356 req = list_entry(tmp,
30c0aa39
OD
357 struct ptlrpc_request,
358 rq_list);
d7e09d03
PT
359 DEBUG_REQ(D_ERROR, req,
360 "still on delayed list");
361 }
362
2d00bd17
JP
363 CERROR("%s: RPCs in \"%s\" phase found (%d). Network is sluggish? Waiting them to error out.\n",
364 cli_tgt,
d7e09d03
PT
365 ptlrpc_phase2str(RQ_PHASE_UNREGISTERING),
366 atomic_read(&imp->
2d00bd17 367 imp_unregistering));
d7e09d03
PT
368 }
369 spin_unlock(&imp->imp_lock);
d0bfef31 370 }
d7e09d03
PT
371 } while (rc != 0);
372
373 /*
374 * Let's additionally check that no new rpcs added to import in
375 * "invalidate" state.
376 */
377 LASSERT(atomic_read(&imp->imp_inflight) == 0);
378 obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
379 sptlrpc_import_flush_all_ctx(imp);
380
381 atomic_dec(&imp->imp_inval_count);
382 wake_up_all(&imp->imp_recovery_waitq);
383}
384EXPORT_SYMBOL(ptlrpc_invalidate_import);
385
386/* unset imp_invalid */
387void ptlrpc_activate_import(struct obd_import *imp)
388{
389 struct obd_device *obd = imp->imp_obd;
390
391 spin_lock(&imp->imp_lock);
0b291b9a
HZ
392 if (imp->imp_deactive != 0) {
393 spin_unlock(&imp->imp_lock);
394 return;
395 }
396
d7e09d03 397 imp->imp_invalid = 0;
d7e09d03
PT
398 spin_unlock(&imp->imp_lock);
399 obd_import_event(obd, imp, IMP_EVENT_ACTIVE);
400}
401EXPORT_SYMBOL(ptlrpc_activate_import);
402
cca8fca1
AS
403static void ptlrpc_pinger_force(struct obd_import *imp)
404{
405 CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd),
406 ptlrpc_import_state_name(imp->imp_state));
407
408 spin_lock(&imp->imp_lock);
409 imp->imp_force_verify = 1;
410 spin_unlock(&imp->imp_lock);
411
412 if (imp->imp_state != LUSTRE_IMP_CONNECTING)
413 ptlrpc_pinger_wake_up();
414}
415
d7e09d03
PT
416void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
417{
d7e09d03
PT
418 LASSERT(!imp->imp_dlm_fake);
419
420 if (ptlrpc_set_import_discon(imp, conn_cnt)) {
421 if (!imp->imp_replayable) {
2d00bd17 422 CDEBUG(D_HA, "import %s@%s for %s not replayable, auto-deactivating\n",
d7e09d03
PT
423 obd2cli_tgt(imp->imp_obd),
424 imp->imp_connection->c_remote_uuid.uuid,
425 imp->imp_obd->obd_name);
426 ptlrpc_deactivate_import(imp);
427 }
428
cca8fca1 429 ptlrpc_pinger_force(imp);
d7e09d03 430 }
d7e09d03
PT
431}
432EXPORT_SYMBOL(ptlrpc_fail_import);
433
434int ptlrpc_reconnect_import(struct obd_import *imp)
435{
cca8fca1
AS
436 struct l_wait_info lwi;
437 int secs = cfs_time_seconds(obd_timeout);
438 int rc;
439
440 ptlrpc_pinger_force(imp);
441
442 CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n",
443 obd2cli_tgt(imp->imp_obd), secs);
444
445 lwi = LWI_TIMEOUT(secs, NULL, NULL);
446 rc = l_wait_event(imp->imp_recovery_waitq,
447 !ptlrpc_import_in_recovery(imp), &lwi);
448 CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd),
449 ptlrpc_import_state_name(imp->imp_state));
450 return rc;
d7e09d03
PT
451}
452EXPORT_SYMBOL(ptlrpc_reconnect_import);
453
454/**
455 * Connection on import \a imp is changed to another one (if more than one is
456 * present). We typically chose connection that we have not tried to connect to
457 * the longest
458 */
459static int import_select_connection(struct obd_import *imp)
460{
461 struct obd_import_conn *imp_conn = NULL, *conn;
462 struct obd_export *dlmexp;
463 char *target_start;
464 int target_len, tried_all = 1;
d7e09d03
PT
465
466 spin_lock(&imp->imp_lock);
467
468 if (list_empty(&imp->imp_conn_list)) {
469 CERROR("%s: no connections available\n",
470 imp->imp_obd->obd_name);
471 spin_unlock(&imp->imp_lock);
0a3bdb00 472 return -EINVAL;
d7e09d03
PT
473 }
474
475 list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
b0f5aad5 476 CDEBUG(D_HA, "%s: connect to NID %s last attempt %llu\n",
d7e09d03
PT
477 imp->imp_obd->obd_name,
478 libcfs_nid2str(conn->oic_conn->c_peer.nid),
479 conn->oic_last_attempt);
480
481 /* If we have not tried this connection since
dadfcdab
OD
482 * the last successful attempt, go with this one
483 */
d7e09d03
PT
484 if ((conn->oic_last_attempt == 0) ||
485 cfs_time_beforeq_64(conn->oic_last_attempt,
30c0aa39 486 imp->imp_last_success_conn)) {
d7e09d03
PT
487 imp_conn = conn;
488 tried_all = 0;
489 break;
490 }
491
492 /* If all of the connections have already been tried
dadfcdab
OD
493 * since the last successful connection; just choose the
494 * least recently used
495 */
d7e09d03
PT
496 if (!imp_conn)
497 imp_conn = conn;
498 else if (cfs_time_before_64(conn->oic_last_attempt,
499 imp_conn->oic_last_attempt))
500 imp_conn = conn;
501 }
502
503 /* if not found, simply choose the current one */
504 if (!imp_conn || imp->imp_force_reconnect) {
505 LASSERT(imp->imp_conn_current);
506 imp_conn = imp->imp_conn_current;
507 tried_all = 0;
508 }
509 LASSERT(imp_conn->oic_conn);
510
511 /* If we've tried everything, and we're back to the beginning of the
dadfcdab
OD
512 * list, increase our timeout and try again. It will be reset when
513 * we do finally connect. (FIXME: really we should wait for all network
514 * state associated with the last connection attempt to drain before
515 * trying to reconnect on it.)
516 */
d7e09d03
PT
517 if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
518 struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
50ffcb7e 519
d7e09d03
PT
520 if (at_get(at) < CONNECTION_SWITCH_MAX) {
521 at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
522 if (at_get(at) > CONNECTION_SWITCH_MAX)
523 at_reset(at, CONNECTION_SWITCH_MAX);
524 }
525 LASSERT(imp_conn->oic_last_attempt);
2d00bd17
JP
526 CDEBUG(D_HA, "%s: tried all connections, increasing latency to %ds\n",
527 imp->imp_obd->obd_name, at_get(at));
d7e09d03
PT
528 }
529
530 imp_conn->oic_last_attempt = cfs_time_current_64();
531
532 /* switch connection, don't mind if it's same as the current one */
a5cb8880 533 ptlrpc_connection_put(imp->imp_connection);
d7e09d03
PT
534 imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
535
d0bfef31 536 dlmexp = class_conn2export(&imp->imp_dlm_handle);
a5cb8880 537 ptlrpc_connection_put(dlmexp->exp_connection);
d7e09d03
PT
538 dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
539 class_export_put(dlmexp);
540
541 if (imp->imp_conn_current != imp_conn) {
542 if (imp->imp_conn_current) {
543 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
544 &target_start, &target_len);
545
2d00bd17 546 CDEBUG(D_HA, "%s: Connection changing to %.*s (at %s)\n",
d7e09d03
PT
547 imp->imp_obd->obd_name,
548 target_len, target_start,
549 libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
550 }
551
552 imp->imp_conn_current = imp_conn;
553 }
554
555 CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
556 imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
557 libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
558
559 spin_unlock(&imp->imp_lock);
560
0a3bdb00 561 return 0;
d7e09d03
PT
562}
563
564/*
565 * must be called under imp_lock
566 */
567static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
568{
569 struct ptlrpc_request *req;
570 struct list_head *tmp;
571
63d42578 572 /* The requests in committed_list always have smaller transnos than
dadfcdab
OD
573 * the requests in replay_list
574 */
63d42578
HZ
575 if (!list_empty(&imp->imp_committed_list)) {
576 tmp = imp->imp_committed_list.next;
577 req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
578 *transno = req->rq_transno;
579 if (req->rq_transno == 0) {
580 DEBUG_REQ(D_ERROR, req,
581 "zero transno in committed_list");
582 LBUG();
583 }
584 return 1;
d7e09d03 585 }
63d42578
HZ
586 if (!list_empty(&imp->imp_replay_list)) {
587 tmp = imp->imp_replay_list.next;
588 req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
589 *transno = req->rq_transno;
590 if (req->rq_transno == 0) {
591 DEBUG_REQ(D_ERROR, req, "zero transno in replay_list");
592 LBUG();
593 }
594 return 1;
595 }
596 return 0;
d7e09d03
PT
597}
598
599/**
600 * Attempt to (re)connect import \a imp. This includes all preparations,
601 * initializing CONNECT RPC request and passing it to ptlrpcd for
602 * actual sending.
603 * Returns 0 on success or error code.
604 */
605int ptlrpc_connect_import(struct obd_import *imp)
606{
607 struct obd_device *obd = imp->imp_obd;
608 int initial_connect = 0;
609 int set_transno = 0;
610 __u64 committed_before_reconnect = 0;
611 struct ptlrpc_request *request;
612 char *bufs[] = { NULL,
613 obd2cli_tgt(imp->imp_obd),
614 obd->obd_uuid.uuid,
615 (char *)&imp->imp_dlm_handle,
616 (char *)&imp->imp_connect_data };
617 struct ptlrpc_connect_async_args *aa;
618 int rc;
d7e09d03
PT
619
620 spin_lock(&imp->imp_lock);
621 if (imp->imp_state == LUSTRE_IMP_CLOSED) {
622 spin_unlock(&imp->imp_lock);
623 CERROR("can't connect to a closed import\n");
0a3bdb00 624 return -EINVAL;
d7e09d03
PT
625 } else if (imp->imp_state == LUSTRE_IMP_FULL) {
626 spin_unlock(&imp->imp_lock);
627 CERROR("already connected\n");
0a3bdb00 628 return 0;
d7e09d03
PT
629 } else if (imp->imp_state == LUSTRE_IMP_CONNECTING) {
630 spin_unlock(&imp->imp_lock);
631 CERROR("already connecting\n");
0a3bdb00 632 return -EALREADY;
d7e09d03
PT
633 }
634
635 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);
636
637 imp->imp_conn_cnt++;
638 imp->imp_resend_replay = 0;
639
640 if (!lustre_handle_is_used(&imp->imp_remote_handle))
641 initial_connect = 1;
642 else
643 committed_before_reconnect = imp->imp_peer_committed_transno;
644
645 set_transno = ptlrpc_first_transno(imp,
646 &imp->imp_connect_data.ocd_transno);
647 spin_unlock(&imp->imp_lock);
648
649 rc = import_select_connection(imp);
650 if (rc)
a9b3e8f3 651 goto out;
d7e09d03 652
5bcfab13 653 rc = sptlrpc_import_sec_adapt(imp, NULL, NULL);
d7e09d03 654 if (rc)
a9b3e8f3 655 goto out;
d7e09d03
PT
656
657 /* Reset connect flags to the originally requested flags, in case
dadfcdab
OD
658 * the server is updated on-the-fly we will get the new features.
659 */
d7e09d03
PT
660 imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig;
661 /* Reset ocd_version each time so the server knows the exact versions */
662 imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE;
663 imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
664 imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
665
666 rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd,
667 &obd->obd_uuid, &imp->imp_connect_data, NULL);
668 if (rc)
a9b3e8f3 669 goto out;
d7e09d03
PT
670
671 request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT);
8b382089 672 if (!request) {
a9b3e8f3
JL
673 rc = -ENOMEM;
674 goto out;
675 }
d7e09d03
PT
676
677 rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION,
678 imp->imp_connect_op, bufs, NULL);
679 if (rc) {
680 ptlrpc_request_free(request);
a9b3e8f3 681 goto out;
d7e09d03
PT
682 }
683
684 /* Report the rpc service time to the server so that it knows how long
dadfcdab
OD
685 * to wait for clients to join recovery
686 */
d7e09d03
PT
687 lustre_msg_set_service_time(request->rq_reqmsg,
688 at_timeout2est(request->rq_timeout));
689
690 /* The amount of time we give the server to process the connect req.
691 * import_select_connection will increase the net latency on
692 * repeated reconnect attempts to cover slow networks.
693 * We override/ignore the server rpc completion estimate here,
dadfcdab
OD
694 * which may be large if this is a reconnect attempt
695 */
d7e09d03
PT
696 request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
697 lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);
698
699 lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER);
700
701 request->rq_no_resend = request->rq_no_delay = 1;
702 request->rq_send_state = LUSTRE_IMP_CONNECTING;
703 /* Allow a slightly larger reply for future growth compatibility */
704 req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER,
705 sizeof(struct obd_connect_data)+16*sizeof(__u64));
706 ptlrpc_request_set_replen(request);
707 request->rq_interpret_reply = ptlrpc_connect_interpret;
708
3949015e 709 CLASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
d7e09d03 710 aa = ptlrpc_req_async_args(request);
ec83e611 711 memset(aa, 0, sizeof(*aa));
d7e09d03
PT
712
713 aa->pcaa_peer_committed = committed_before_reconnect;
714 aa->pcaa_initial_connect = initial_connect;
715
716 if (aa->pcaa_initial_connect) {
717 spin_lock(&imp->imp_lock);
718 imp->imp_replayable = 1;
719 spin_unlock(&imp->imp_lock);
720 lustre_msg_add_op_flags(request->rq_reqmsg,
721 MSG_CONNECT_INITIAL);
722 }
723
724 if (set_transno)
725 lustre_msg_add_op_flags(request->rq_reqmsg,
726 MSG_CONNECT_TRANSNO);
727
728 DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)",
729 request->rq_timeout);
c5c4c6fa 730 ptlrpcd_add_req(request);
d7e09d03
PT
731 rc = 0;
732out:
c5c4c6fa 733 if (rc != 0)
d7e09d03 734 IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
d7e09d03 735
0a3bdb00 736 return rc;
d7e09d03
PT
737}
738EXPORT_SYMBOL(ptlrpc_connect_import);
739
740static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
741{
742 int force_verify;
743
744 spin_lock(&imp->imp_lock);
745 force_verify = imp->imp_force_verify != 0;
746 spin_unlock(&imp->imp_lock);
747
748 if (force_verify)
749 ptlrpc_pinger_wake_up();
750}
751
/* A connect failure of -EBUSY or -EAGAIN means the export is busy and
 * the same target should be retried rather than switching connections.
 */
static int ptlrpc_busy_reconnect(int rc)
{
	return rc == -EBUSY || rc == -EAGAIN;
}
756
757/**
758 * interpret_reply callback for connect RPCs.
759 * Looks into returned status of connect operation and decides
760 * what to do with the import - i.e enter recovery, promote it to
761 * full state for normal operations of disconnect it due to an error.
762 */
763static int ptlrpc_connect_interpret(const struct lu_env *env,
764 struct ptlrpc_request *request,
765 void *data, int rc)
766{
767 struct ptlrpc_connect_async_args *aa = data;
768 struct obd_import *imp = request->rq_import;
769 struct client_obd *cli = &imp->imp_obd->u.cli;
770 struct lustre_handle old_hdl;
771 __u64 old_connect_flags;
772 int msg_flags;
773 struct obd_connect_data *ocd;
774 struct obd_export *exp;
775 int ret;
d7e09d03
PT
776
777 spin_lock(&imp->imp_lock);
778 if (imp->imp_state == LUSTRE_IMP_CLOSED) {
779 imp->imp_connect_tried = 1;
780 spin_unlock(&imp->imp_lock);
0a3bdb00 781 return 0;
d7e09d03
PT
782 }
783
784 if (rc) {
785 /* if this reconnect to busy export - not need select new target
dadfcdab
OD
786 * for connecting
787 */
d7e09d03
PT
788 imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
789 spin_unlock(&imp->imp_lock);
790 ptlrpc_maybe_ping_import_soon(imp);
a9b3e8f3 791 goto out;
d7e09d03
PT
792 }
793 spin_unlock(&imp->imp_lock);
794
795 LASSERT(imp->imp_conn_current);
796
797 msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
798
799 ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA,
800 RCL_SERVER);
801 /* server replied obd_connect_data is always bigger */
802 ocd = req_capsule_server_sized_get(&request->rq_pill,
803 &RMF_CONNECT_DATA, ret);
804
8b382089 805 if (!ocd) {
d7e09d03
PT
806 CERROR("%s: no connect data from server\n",
807 imp->imp_obd->obd_name);
808 rc = -EPROTO;
a9b3e8f3 809 goto out;
d7e09d03
PT
810 }
811
812 spin_lock(&imp->imp_lock);
813
814 /* All imports are pingable */
815 imp->imp_pingable = 1;
816 imp->imp_force_reconnect = 0;
817 imp->imp_force_verify = 0;
818
819 imp->imp_connect_data = *ocd;
820
821 CDEBUG(D_HA, "%s: connect to target with instance %u\n",
822 imp->imp_obd->obd_name, ocd->ocd_instance);
823 exp = class_conn2export(&imp->imp_dlm_handle);
824
825 spin_unlock(&imp->imp_lock);
826
827 /* check that server granted subset of flags we asked for. */
828 if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) !=
829 ocd->ocd_connect_flags) {
55f5a824 830 CERROR("%s: Server didn't granted asked subset of flags: asked=%#llx grranted=%#llx\n",
1d8cb70c 831 imp->imp_obd->obd_name, imp->imp_connect_flags_orig,
d7e09d03 832 ocd->ocd_connect_flags);
a9b3e8f3
JL
833 rc = -EPROTO;
834 goto out;
d7e09d03
PT
835 }
836
837 if (!exp) {
838 /* This could happen if export is cleaned during the
dadfcdab
OD
839 * connect attempt
840 */
d7e09d03
PT
841 CERROR("%s: missing export after connect\n",
842 imp->imp_obd->obd_name);
a9b3e8f3
JL
843 rc = -ENODEV;
844 goto out;
d7e09d03
PT
845 }
846 old_connect_flags = exp_connect_flags(exp);
847 exp->exp_connect_data = *ocd;
848 imp->imp_obd->obd_self_export->exp_connect_data = *ocd;
849 class_export_put(exp);
850
851 obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD);
852
853 if (aa->pcaa_initial_connect) {
854 spin_lock(&imp->imp_lock);
855 if (msg_flags & MSG_CONNECT_REPLAYABLE) {
856 imp->imp_replayable = 1;
857 spin_unlock(&imp->imp_lock);
858 CDEBUG(D_HA, "connected to replayable target: %s\n",
859 obd2cli_tgt(imp->imp_obd));
860 } else {
861 imp->imp_replayable = 0;
862 spin_unlock(&imp->imp_lock);
863 }
864
865 /* if applies, adjust the imp->imp_msg_magic here
dadfcdab
OD
866 * according to reply flags
867 */
d7e09d03
PT
868
869 imp->imp_remote_handle =
870 *lustre_msg_get_handle(request->rq_repmsg);
871
872 /* Initial connects are allowed for clients with non-random
873 * uuids when servers are in recovery. Simply signal the
dadfcdab
OD
874 * servers replay is complete and wait in REPLAY_WAIT.
875 */
d7e09d03
PT
876 if (msg_flags & MSG_CONNECT_RECOVERING) {
877 CDEBUG(D_HA, "connect to %s during recovery\n",
878 obd2cli_tgt(imp->imp_obd));
879 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
880 } else {
881 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
882 ptlrpc_activate_import(imp);
883 }
884
a9b3e8f3
JL
885 rc = 0;
886 goto finish;
d7e09d03
PT
887 }
888
889 /* Determine what recovery state to move the import to. */
2b241d31 890 if (msg_flags & MSG_CONNECT_RECONNECT) {
d7e09d03
PT
891 memset(&old_hdl, 0, sizeof(old_hdl));
892 if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg),
3949015e 893 sizeof(old_hdl))) {
55f5a824 894 LCONSOLE_WARN("Reconnect to %s (at @%s) failed due bad handle %#llx\n",
d7e09d03
PT
895 obd2cli_tgt(imp->imp_obd),
896 imp->imp_connection->c_remote_uuid.uuid,
897 imp->imp_dlm_handle.cookie);
a9b3e8f3
JL
898 rc = -ENOTCONN;
899 goto out;
d7e09d03
PT
900 }
901
902 if (memcmp(&imp->imp_remote_handle,
903 lustre_msg_get_handle(request->rq_repmsg),
904 sizeof(imp->imp_remote_handle))) {
905 int level = msg_flags & MSG_CONNECT_RECOVERING ?
906 D_HA : D_WARNING;
907
908 /* Bug 16611/14775: if server handle have changed,
909 * that means some sort of disconnection happened.
910 * If the server is not in recovery, that also means it
911 * already erased all of our state because of previous
912 * eviction. If it is in recovery - we are safe to
913 * participate since we can reestablish all of our state
dadfcdab
OD
914 * with server again
915 */
2b241d31 916 if ((msg_flags & MSG_CONNECT_RECOVERING)) {
b533ff4b 917 CDEBUG(level, "%s@%s changed server handle from %#llx to %#llx but is still in recovery\n",
d7e09d03
PT
918 obd2cli_tgt(imp->imp_obd),
919 imp->imp_connection->c_remote_uuid.uuid,
920 imp->imp_remote_handle.cookie,
921 lustre_msg_get_handle(
922 request->rq_repmsg)->cookie);
923 } else {
2d00bd17 924 LCONSOLE_WARN("Evicted from %s (at %s) after server handle changed from %#llx to %#llx\n",
d7e09d03
PT
925 obd2cli_tgt(imp->imp_obd),
926 imp->imp_connection-> \
927 c_remote_uuid.uuid,
928 imp->imp_remote_handle.cookie,
929 lustre_msg_get_handle(
2d00bd17 930 request->rq_repmsg)->cookie);
d7e09d03
PT
931 }
932
d7e09d03
PT
933 imp->imp_remote_handle =
934 *lustre_msg_get_handle(request->rq_repmsg);
935
2b241d31 936 if (!(msg_flags & MSG_CONNECT_RECOVERING)) {
d7e09d03 937 IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
a9b3e8f3
JL
938 rc = 0;
939 goto finish;
d7e09d03
PT
940 }
941
942 } else {
943 CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
944 obd2cli_tgt(imp->imp_obd),
945 imp->imp_connection->c_remote_uuid.uuid);
946 }
947
948 if (imp->imp_invalid) {
2d00bd17
JP
949 CDEBUG(D_HA, "%s: reconnected but import is invalid; marking evicted\n",
950 imp->imp_obd->obd_name);
d7e09d03 951 IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
2b241d31 952 } else if (msg_flags & MSG_CONNECT_RECOVERING) {
d7e09d03
PT
953 CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
954 imp->imp_obd->obd_name,
955 obd2cli_tgt(imp->imp_obd));
956
957 spin_lock(&imp->imp_lock);
958 imp->imp_resend_replay = 1;
959 spin_unlock(&imp->imp_lock);
960
502cb58e 961 IMPORT_SET_STATE(imp, imp->imp_replay_state);
d7e09d03
PT
962 } else {
963 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
964 }
2b241d31 965 } else if ((msg_flags & MSG_CONNECT_RECOVERING) && !imp->imp_invalid) {
d7e09d03
PT
966 LASSERT(imp->imp_replayable);
967 imp->imp_remote_handle =
968 *lustre_msg_get_handle(request->rq_repmsg);
969 imp->imp_last_replay_transno = 0;
970 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
971 } else {
2d00bd17
JP
972 DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags not set: %x)",
973 imp->imp_obd->obd_name, msg_flags);
d7e09d03
PT
974 imp->imp_remote_handle =
975 *lustre_msg_get_handle(request->rq_repmsg);
976 IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
977 }
978
979 /* Sanity checks for a reconnected import. */
980 if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) {
2d00bd17 981 CERROR("imp_replayable flag does not match server after reconnect. We should LBUG right here.\n");
d7e09d03
PT
982 }
983
984 if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 &&
985 lustre_msg_get_last_committed(request->rq_repmsg) <
986 aa->pcaa_peer_committed) {
2d00bd17 987 CERROR("%s went back in time (transno %lld was previously committed, server now claims %lld)! See https://bugzilla.lustre.org/show_bug.cgi?id=9646\n",
d7e09d03
PT
988 obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
989 lustre_msg_get_last_committed(request->rq_repmsg));
990 }
991
992finish:
993 rc = ptlrpc_import_recovery_state_machine(imp);
994 if (rc != 0) {
995 if (rc == -ENOTCONN) {
2d00bd17 996 CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n",
d7e09d03
PT
997 obd2cli_tgt(imp->imp_obd),
998 imp->imp_connection->c_remote_uuid.uuid);
999 ptlrpc_connect_import(imp);
1000 imp->imp_connect_tried = 1;
0a3bdb00 1001 return 0;
d7e09d03
PT
1002 }
1003 } else {
1004
1005 spin_lock(&imp->imp_lock);
1006 list_del(&imp->imp_conn_current->oic_item);
30c0aa39 1007 list_add(&imp->imp_conn_current->oic_item, &imp->imp_conn_list);
d7e09d03
PT
1008 imp->imp_last_success_conn =
1009 imp->imp_conn_current->oic_last_attempt;
1010
1011 spin_unlock(&imp->imp_lock);
1012
f261f48a
FY
1013 if ((imp->imp_connect_flags_orig & OBD_CONNECT_IBITS) &&
1014 !(ocd->ocd_connect_flags & OBD_CONNECT_IBITS)) {
2d00bd17 1015 LCONSOLE_WARN("%s: MDS %s does not support ibits lock, either very old or invalid: requested %llx, replied %llx\n",
f261f48a
FY
1016 imp->imp_obd->obd_name,
1017 imp->imp_connection->c_remote_uuid.uuid,
1018 imp->imp_connect_flags_orig,
1019 ocd->ocd_connect_flags);
a9b3e8f3
JL
1020 rc = -EPROTO;
1021 goto out;
f261f48a 1022 }
d7e09d03
PT
1023
1024 if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
1025 (ocd->ocd_version > LUSTRE_VERSION_CODE +
1026 LUSTRE_VERSION_OFFSET_WARN ||
1027 ocd->ocd_version < LUSTRE_VERSION_CODE -
1028 LUSTRE_VERSION_OFFSET_WARN)) {
1029 /* Sigh, some compilers do not like #ifdef in the middle
dadfcdab
OD
1030 * of macro arguments
1031 */
2d00bd17
JP
1032 const char *older = "older. Consider upgrading server or downgrading client"
1033 ;
1034 const char *newer = "newer than client version. Consider upgrading client"
1035 ;
d7e09d03 1036
2d00bd17 1037 LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) is much %s (%s)\n",
d7e09d03
PT
1038 obd2cli_tgt(imp->imp_obd),
1039 OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
1040 OBD_OCD_VERSION_MINOR(ocd->ocd_version),
1041 OBD_OCD_VERSION_PATCH(ocd->ocd_version),
1042 OBD_OCD_VERSION_FIX(ocd->ocd_version),
1043 ocd->ocd_version > LUSTRE_VERSION_CODE ?
1044 newer : older, LUSTRE_VERSION_STRING);
1045 }
1046
1047#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
1048 /* Check if server has LU-1252 fix applied to not always swab
1049 * the IR MNE entries. Do this only once per connection. This
1050 * fixup is version-limited, because we don't want to carry the
1051 * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we
1052 * need interop with unpatched 2.2 servers. For newer servers,
dadfcdab
OD
1053 * the client will do MNE swabbing only as needed. LU-1644
1054 */
d7e09d03
PT
1055 if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
1056 !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) &&
1057 OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 &&
1058 OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 &&
1059 OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 &&
1060 strcmp(imp->imp_obd->obd_type->typ_name,
1061 LUSTRE_MGC_NAME) == 0))
1062 imp->imp_need_mne_swab = 1;
1063 else /* clear if server was upgraded since last connect */
1064 imp->imp_need_mne_swab = 0;
1065#else
1066#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
1067#endif
1068
1069 if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) {
1070 /* We sent to the server ocd_cksum_types with bits set
1071 * for algorithms we understand. The server masked off
dadfcdab
OD
1072 * the checksum types it doesn't support
1073 */
d7e09d03
PT
1074 if ((ocd->ocd_cksum_types &
1075 cksum_types_supported_client()) == 0) {
2d00bd17 1076 LCONSOLE_WARN("The negotiation of the checksum algorithm to use with server %s failed (%x/%x), disabling checksums\n",
d7e09d03
PT
1077 obd2cli_tgt(imp->imp_obd),
1078 ocd->ocd_cksum_types,
1079 cksum_types_supported_client());
1080 cli->cl_checksum = 0;
1081 cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
1082 } else {
1083 cli->cl_supp_cksum_types = ocd->ocd_cksum_types;
1084 }
1085 } else {
1086 /* The server does not support OBD_CONNECT_CKSUM.
dadfcdab
OD
1087 * Enforce ADLER for backward compatibility
1088 */
d7e09d03
PT
1089 cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
1090 }
b533ff4b 1091 cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types);
d7e09d03
PT
1092
1093 if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
1094 cli->cl_max_pages_per_rpc =
09cbfeaf 1095 min(ocd->ocd_brw_size >> PAGE_SHIFT,
d7e09d03
PT
1096 cli->cl_max_pages_per_rpc);
1097 else if (imp->imp_connect_op == MDS_CONNECT ||
1098 imp->imp_connect_op == MGS_CONNECT)
1099 cli->cl_max_pages_per_rpc = 1;
1100
1101 /* Reset ns_connect_flags only for initial connect. It might be
1102 * changed in while using FS and if we reset it in reconnect
1103 * this leads to losing user settings done before such as
dadfcdab
OD
1104 * disable lru_resize, etc.
1105 */
d7e09d03
PT
1106 if (old_connect_flags != exp_connect_flags(exp) ||
1107 aa->pcaa_initial_connect) {
55f5a824
GKH
1108 CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server flags: %#llx\n",
1109 imp->imp_obd->obd_name, ocd->ocd_connect_flags);
d7e09d03
PT
1110 imp->imp_obd->obd_namespace->ns_connect_flags =
1111 ocd->ocd_connect_flags;
1112 imp->imp_obd->obd_namespace->ns_orig_connect_flags =
1113 ocd->ocd_connect_flags;
1114 }
1115
1116 if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
1117 (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
1118 /* We need a per-message support flag, because
dadfcdab
OD
1119 * a. we don't know if the incoming connect reply
1120 * supports AT or not (in reply_in_callback)
1121 * until we unpack it.
1122 * b. failovered server means export and flags are gone
1123 * (in ptlrpc_send_reply).
1124 * Can only be set when we know AT is supported at
1125 * both ends
1126 */
d7e09d03
PT
1127 imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
1128 else
1129 imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
1130
1131 if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) &&
1132 (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
1133 imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
1134 else
1135 imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
1136
1137 LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
1138 (cli->cl_max_pages_per_rpc > 0));
1139 }
1140
1141out:
1142 imp->imp_connect_tried = 1;
1143
1144 if (rc != 0) {
1145 IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
1146 if (rc == -EACCES) {
1147 /*
1148 * Give up trying to reconnect
1149 * EACCES means client has no permission for connection
1150 */
1151 imp->imp_obd->obd_no_recov = 1;
1152 ptlrpc_deactivate_import(imp);
1153 }
1154
1155 if (rc == -EPROTO) {
1156 struct obd_connect_data *ocd;
1157
1158 /* reply message might not be ready */
8b382089 1159 if (!request->rq_repmsg)
0a3bdb00 1160 return -EPROTO;
d7e09d03
PT
1161
1162 ocd = req_capsule_server_get(&request->rq_pill,
1163 &RMF_CONNECT_DATA);
1164 if (ocd &&
1165 (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
1166 (ocd->ocd_version != LUSTRE_VERSION_CODE)) {
532118c0
KM
1167 /*
1168 * Actually servers are only supposed to refuse
1169 * connection from liblustre clients, so we
1170 * should never see this from VFS context
1171 */
2d00bd17
JP
1172 LCONSOLE_ERROR_MSG(0x16a, "Server %s version (%d.%d.%d.%d) refused connection from this client with an incompatible version (%s). Client must be recompiled\n",
1173 obd2cli_tgt(imp->imp_obd),
1174 OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
1175 OBD_OCD_VERSION_MINOR(ocd->ocd_version),
1176 OBD_OCD_VERSION_PATCH(ocd->ocd_version),
1177 OBD_OCD_VERSION_FIX(ocd->ocd_version),
1178 LUSTRE_VERSION_STRING);
d7e09d03
PT
1179 ptlrpc_deactivate_import(imp);
1180 IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED);
1181 }
0a3bdb00 1182 return -EPROTO;
d7e09d03
PT
1183 }
1184
1185 ptlrpc_maybe_ping_import_soon(imp);
1186
1187 CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
1188 obd2cli_tgt(imp->imp_obd),
1189 (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
1190 }
1191
1192 wake_up_all(&imp->imp_recovery_waitq);
0a3bdb00 1193 return rc;
d7e09d03
PT
1194}
1195
1196/**
1197 * interpret callback for "completed replay" RPCs.
1198 * \see signal_completed_replay
1199 */
1200static int completed_replay_interpret(const struct lu_env *env,
1201 struct ptlrpc_request *req,
aff9d8e8 1202 void *data, int rc)
d7e09d03 1203{
d7e09d03
PT
1204 atomic_dec(&req->rq_import->imp_replay_inflight);
1205 if (req->rq_status == 0 &&
1206 !req->rq_import->imp_vbr_failed) {
1207 ptlrpc_import_recovery_state_machine(req->rq_import);
1208 } else {
1209 if (req->rq_import->imp_vbr_failed) {
1210 CDEBUG(D_WARNING,
1211 "%s: version recovery fails, reconnecting\n",
1212 req->rq_import->imp_obd->obd_name);
1213 } else {
2d00bd17 1214 CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, reconnecting\n",
d7e09d03
PT
1215 req->rq_import->imp_obd->obd_name,
1216 req->rq_status);
1217 }
1218 ptlrpc_connect_import(req->rq_import);
1219 }
1220
0a3bdb00 1221 return 0;
d7e09d03
PT
1222}
1223
1224/**
1225 * Let server know that we have no requests to replay anymore.
1226 * Achieved by just sending a PING request
1227 */
1228static int signal_completed_replay(struct obd_import *imp)
1229{
1230 struct ptlrpc_request *req;
d7e09d03
PT
1231
1232 if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY)))
0a3bdb00 1233 return 0;
d7e09d03
PT
1234
1235 LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
1236 atomic_inc(&imp->imp_replay_inflight);
1237
1238 req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION,
1239 OBD_PING);
8b382089 1240 if (!req) {
d7e09d03 1241 atomic_dec(&imp->imp_replay_inflight);
0a3bdb00 1242 return -ENOMEM;
d7e09d03
PT
1243 }
1244
1245 ptlrpc_request_set_replen(req);
1246 req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
1247 lustre_msg_add_flags(req->rq_reqmsg,
1248 MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE);
1249 if (AT_OFF)
1250 req->rq_timeout *= 3;
1251 req->rq_interpret_reply = completed_replay_interpret;
1252
c5c4c6fa 1253 ptlrpcd_add_req(req);
0a3bdb00 1254 return 0;
d7e09d03
PT
1255}
1256
1257/**
1258 * In kernel code all import invalidation happens in its own
1259 * separate thread, so that whatever application happened to encounter
1260 * a problem could still be killed or otherwise continue
1261 */
1262static int ptlrpc_invalidate_import_thread(void *data)
1263{
1264 struct obd_import *imp = data;
1265
d7e09d03
PT
1266 unshare_fs_struct();
1267
1268 CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
1269 imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
1270 imp->imp_connection->c_remote_uuid.uuid);
1271
1272 ptlrpc_invalidate_import(imp);
1273
1274 if (obd_dump_on_eviction) {
1275 CERROR("dump the log upon eviction\n");
1276 libcfs_debug_dumplog();
1277 }
1278
1279 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
1280 ptlrpc_import_recovery_state_machine(imp);
1281
1282 class_import_put(imp);
0a3bdb00 1283 return 0;
d7e09d03
PT
1284}
1285
1286/**
1287 * This is the state machine for client-side recovery on import.
1288 *
b6da17f3 1289 * Typically we have two possibly paths. If we came to server and it is not
d7e09d03
PT
1290 * in recovery, we just enter IMP_EVICTED state, invalidate our import
1291 * state and reconnect from scratch.
1292 * If we came to server that is in recovery, we enter IMP_REPLAY import state.
1293 * We go through our list of requests to replay and send them to server one by
1294 * one.
1295 * After sending all request from the list we change import state to
1296 * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from server
1297 * and also all the locks we don't yet have and wait for server to grant us.
1298 * After that we send a special "replay completed" request and change import
1299 * state to IMP_REPLAY_WAIT.
1300 * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER
1301 * state and resend all requests from sending list.
1302 * After that we promote import to FULL state and send all delayed requests
1303 * and import is fully operational after that.
1304 *
1305 */
1306int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
1307{
1308 int rc = 0;
1309 int inflight;
1310 char *target_start;
1311 int target_len;
1312
d7e09d03
PT
1313 if (imp->imp_state == LUSTRE_IMP_EVICTED) {
1314 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
1315 &target_start, &target_len);
1316 /* Don't care about MGC eviction */
1317 if (strcmp(imp->imp_obd->obd_type->typ_name,
1318 LUSTRE_MGC_NAME) != 0) {
2d00bd17 1319 LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted by %.*s; in progress operations using this service will fail.\n",
d7e09d03
PT
1320 imp->imp_obd->obd_name, target_len,
1321 target_start);
1322 }
1323 CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
1324 obd2cli_tgt(imp->imp_obd),
1325 imp->imp_connection->c_remote_uuid.uuid);
1326 /* reset vbr_failed flag upon eviction */
1327 spin_lock(&imp->imp_lock);
1328 imp->imp_vbr_failed = 0;
1329 spin_unlock(&imp->imp_lock);
1330
1331 {
68b636b6 1332 struct task_struct *task;
d7e09d03 1333 /* bug 17802: XXX client_disconnect_export vs connect request
9c379663 1334 * race. if client is evicted at this time, we start
d7e09d03 1335 * invalidate thread without reference to import and import can
dadfcdab
OD
1336 * be freed at same time.
1337 */
d7e09d03
PT
1338 class_import_get(imp);
1339 task = kthread_run(ptlrpc_invalidate_import_thread, imp,
30c0aa39 1340 "ll_imp_inval");
d7e09d03
PT
1341 if (IS_ERR(task)) {
1342 class_import_put(imp);
1343 CERROR("error starting invalidate thread: %d\n", rc);
1344 rc = PTR_ERR(task);
1345 } else {
1346 rc = 0;
1347 }
0a3bdb00 1348 return rc;
d7e09d03
PT
1349 }
1350 }
1351
1352 if (imp->imp_state == LUSTRE_IMP_REPLAY) {
1353 CDEBUG(D_HA, "replay requested by %s\n",
1354 obd2cli_tgt(imp->imp_obd));
1355 rc = ptlrpc_replay_next(imp, &inflight);
1356 if (inflight == 0 &&
1357 atomic_read(&imp->imp_replay_inflight) == 0) {
1358 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
1359 rc = ldlm_replay_locks(imp);
1360 if (rc)
a9b3e8f3 1361 goto out;
d7e09d03
PT
1362 }
1363 rc = 0;
1364 }
1365
1366 if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) {
1367 if (atomic_read(&imp->imp_replay_inflight) == 0) {
1368 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT);
1369 rc = signal_completed_replay(imp);
1370 if (rc)
a9b3e8f3 1371 goto out;
d7e09d03
PT
1372 }
1373
1374 }
1375
1376 if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) {
1377 if (atomic_read(&imp->imp_replay_inflight) == 0) {
1378 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
1379 }
1380 }
1381
1382 if (imp->imp_state == LUSTRE_IMP_RECOVER) {
1383 CDEBUG(D_HA, "reconnected to %s@%s\n",
1384 obd2cli_tgt(imp->imp_obd),
1385 imp->imp_connection->c_remote_uuid.uuid);
1386
1387 rc = ptlrpc_resend(imp);
1388 if (rc)
a9b3e8f3 1389 goto out;
d7e09d03
PT
1390 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
1391 ptlrpc_activate_import(imp);
1392
1393 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
1394 &target_start, &target_len);
1395 LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n",
1396 imp->imp_obd->obd_name,
1397 target_len, target_start,
1398 libcfs_nid2str(imp->imp_connection->c_peer.nid));
1399 }
1400
1401 if (imp->imp_state == LUSTRE_IMP_FULL) {
1402 wake_up_all(&imp->imp_recovery_waitq);
1403 ptlrpc_wake_delayed(imp);
1404 }
1405
1406out:
0a3bdb00 1407 return rc;
d7e09d03
PT
1408}
1409
1410int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
1411{
1412 struct ptlrpc_request *req;
1413 int rq_opc, rc = 0;
d7e09d03 1414
88291a7a 1415 if (imp->imp_obd->obd_force)
a9b3e8f3 1416 goto set_state;
d7e09d03
PT
1417
1418 switch (imp->imp_connect_op) {
88291a7a
AD
1419 case OST_CONNECT:
1420 rq_opc = OST_DISCONNECT;
1421 break;
1422 case MDS_CONNECT:
1423 rq_opc = MDS_DISCONNECT;
1424 break;
1425 case MGS_CONNECT:
1426 rq_opc = MGS_DISCONNECT;
1427 break;
d7e09d03 1428 default:
88291a7a 1429 rc = -EINVAL;
2d00bd17 1430 CERROR("%s: don't know how to disconnect from %s (connect_op %d): rc = %d\n",
88291a7a
AD
1431 imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
1432 imp->imp_connect_op, rc);
1433 return rc;
d7e09d03
PT
1434 }
1435
1436 if (ptlrpc_import_in_recovery(imp)) {
1437 struct l_wait_info lwi;
b2d201bd 1438 long timeout;
d7e09d03 1439
d7e09d03
PT
1440 if (AT_OFF) {
1441 if (imp->imp_server_timeout)
1442 timeout = cfs_time_seconds(obd_timeout / 2);
1443 else
1444 timeout = cfs_time_seconds(obd_timeout);
1445 } else {
1446 int idx = import_at_get_index(imp,
1447 imp->imp_client->cli_request_portal);
1448 timeout = cfs_time_seconds(
1449 at_get(&imp->imp_at.iat_service_estimate[idx]));
1450 }
1451
1452 lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
1453 back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
1454 rc = l_wait_event(imp->imp_recovery_waitq,
1455 !ptlrpc_import_in_recovery(imp), &lwi);
1456
1457 }
1458
1459 spin_lock(&imp->imp_lock);
1460 if (imp->imp_state != LUSTRE_IMP_FULL)
a9b3e8f3 1461 goto out;
d7e09d03
PT
1462 spin_unlock(&imp->imp_lock);
1463
1464 req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT,
1465 LUSTRE_OBD_VERSION, rq_opc);
1466 if (req) {
1467 /* We are disconnecting, do not retry a failed DISCONNECT rpc if
1468 * it fails. We can get through the above with a down server
dadfcdab
OD
1469 * if the client doesn't know the server is gone yet.
1470 */
d7e09d03
PT
1471 req->rq_no_resend = 1;
1472
1473 /* We want client umounts to happen quickly, no matter the
dadfcdab
OD
1474 * server state...
1475 */
d7e09d03
PT
1476 req->rq_timeout = min_t(int, req->rq_timeout,
1477 INITIAL_CONNECT_TIMEOUT);
1478
1479 IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
d0bfef31 1480 req->rq_send_state = LUSTRE_IMP_CONNECTING;
d7e09d03
PT
1481 ptlrpc_request_set_replen(req);
1482 rc = ptlrpc_queue_wait(req);
1483 ptlrpc_req_finished(req);
1484 }
1485
1486set_state:
1487 spin_lock(&imp->imp_lock);
1488out:
1489 if (noclose)
1490 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
1491 else
1492 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
1493 memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
1494 spin_unlock(&imp->imp_lock);
1495
88291a7a
AD
1496 if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN)
1497 rc = 0;
1498
0a3bdb00 1499 return rc;
d7e09d03
PT
1500}
1501EXPORT_SYMBOL(ptlrpc_disconnect_import);
1502
d7e09d03
PT
1503/* Adaptive Timeout utils */
1504extern unsigned int at_min, at_max, at_history;
1505
1506/* Bin into timeslices using AT_BINS bins.
dadfcdab
OD
1507 * This gives us a max of the last binlimit*AT_BINS secs without the storage,
1508 * but still smoothing out a return to normalcy from a slow response.
1509 * (E.g. remember the maximum latency in each minute of the last 4 minutes.)
1510 */
d7e09d03
PT
1511int at_measured(struct adaptive_timeout *at, unsigned int val)
1512{
1513 unsigned int old = at->at_current;
0ac0478b
AB
1514 time64_t now = ktime_get_real_seconds();
1515 long binlimit = max_t(long, at_history / AT_BINS, 1);
d7e09d03
PT
1516
1517 LASSERT(at);
1518 CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
0ac0478b 1519 val, at, (long)(now - at->at_binstart), at->at_current,
d7e09d03
PT
1520 at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
1521
1522 if (val == 0)
1523 /* 0's don't count, because we never want our timeout to
dadfcdab
OD
1524 * drop to 0, and because 0 could mean an error
1525 */
d7e09d03
PT
1526 return 0;
1527
1528 spin_lock(&at->at_lock);
1529
1530 if (unlikely(at->at_binstart == 0)) {
1531 /* Special case to remove default from history */
1532 at->at_current = val;
1533 at->at_worst_ever = val;
1534 at->at_worst_time = now;
1535 at->at_hist[0] = val;
1536 at->at_binstart = now;
3949015e 1537 } else if (now - at->at_binstart < binlimit) {
d7e09d03
PT
1538 /* in bin 0 */
1539 at->at_hist[0] = max(val, at->at_hist[0]);
1540 at->at_current = max(val, at->at_current);
1541 } else {
1542 int i, shift;
1543 unsigned int maxv = val;
1544 /* move bins over */
0ac0478b 1545 shift = (u32)(now - at->at_binstart) / binlimit;
d7e09d03 1546 LASSERT(shift > 0);
3949015e 1547 for (i = AT_BINS - 1; i >= 0; i--) {
d7e09d03
PT
1548 if (i >= shift) {
1549 at->at_hist[i] = at->at_hist[i - shift];
1550 maxv = max(maxv, at->at_hist[i]);
1551 } else {
1552 at->at_hist[i] = 0;
1553 }
1554 }
1555 at->at_hist[0] = val;
1556 at->at_current = maxv;
1557 at->at_binstart += shift * binlimit;
1558 }
1559
1560 if (at->at_current > at->at_worst_ever) {
1561 at->at_worst_ever = at->at_current;
1562 at->at_worst_time = now;
1563 }
1564
1565 if (at->at_flags & AT_FLG_NOHIST)
1566 /* Only keep last reported val; keeping the rest of the history
dadfcdab
OD
1567 * for debugfs only
1568 */
d7e09d03
PT
1569 at->at_current = val;
1570
1571 if (at_max > 0)
1572 at->at_current = min(at->at_current, at_max);
1573 at->at_current = max(at->at_current, at_min);
1574
1575 if (at->at_current != old)
2d00bd17
JP
1576 CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d (val=%u) hist %u %u %u %u\n",
1577 at,
d7e09d03
PT
1578 old, at->at_current, at->at_current - old, val,
1579 at->at_hist[0], at->at_hist[1], at->at_hist[2],
1580 at->at_hist[3]);
1581
1582 /* if we changed, report the old value */
1583 old = (at->at_current != old) ? old : 0;
1584
1585 spin_unlock(&at->at_lock);
1586 return old;
1587}
1588
1589/* Find the imp_at index for a given portal; assign if space available */
1590int import_at_get_index(struct obd_import *imp, int portal)
1591{
1592 struct imp_at *at = &imp->imp_at;
1593 int i;
1594
1595 for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
1596 if (at->iat_portal[i] == portal)
1597 return i;
1598 if (at->iat_portal[i] == 0)
1599 /* unused */
1600 break;
1601 }
1602
1603 /* Not found in list, add it under a lock */
1604 spin_lock(&imp->imp_lock);
1605
1606 /* Check unused under lock */
1607 for (; i < IMP_AT_MAX_PORTALS; i++) {
1608 if (at->iat_portal[i] == portal)
1609 goto out;
1610 if (at->iat_portal[i] == 0)
1611 /* unused */
1612 break;
1613 }
1614
1615 /* Not enough portals? */
1616 LASSERT(i < IMP_AT_MAX_PORTALS);
1617
1618 at->iat_portal[i] = portal;
1619out:
1620 spin_unlock(&imp->imp_lock);
1621 return i;
1622}
This page took 0.559448 seconds and 5 git commands to generate.