1 #include "ceph_debug.h"
3 #include <linux/types.h>
4 #include <linux/random.h>
5 #include <linux/sched.h>
7 #include "mon_client.h"
13 * Interact with Ceph monitor cluster. Handle requests for new map
14 * versions, and periodically resend as needed. Also implement
15 * statfs() and umount().
17 * A small cluster of Ceph "monitors" are responsible for managing critical
18 * cluster configuration and state information. An odd number (e.g., 3, 5)
19 * of cmon daemons use a modified version of the Paxos part-time parliament
20 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
21 * list of clients who have mounted the file system.
23 * We maintain an open, active session with a monitor at all times in order to
24 * receive timely MDSMap updates. We periodically send a keepalive byte on the
25 * TCP socket to ensure we detect a failure. If the connection does break, we
26 * randomly hunt for a new monitor. Once the connection is reestablished, we
27 * resend any outstanding requests.
30 const static struct ceph_connection_operations mon_con_ops
;
32 static int __validate_auth(struct ceph_mon_client
*monc
);
35 * Decode a monmap blob (e.g., during mount).
37 struct ceph_monmap
*ceph_monmap_decode(void *p
, void *end
)
39 struct ceph_monmap
*m
= NULL
;
41 struct ceph_fsid fsid
;
46 ceph_decode_32_safe(&p
, end
, len
, bad
);
47 ceph_decode_need(&p
, end
, len
, bad
);
49 dout("monmap_decode %p %p len %d\n", p
, end
, (int)(end
-p
));
51 ceph_decode_16_safe(&p
, end
, version
, bad
);
53 ceph_decode_need(&p
, end
, sizeof(fsid
) + 2*sizeof(u32
), bad
);
54 ceph_decode_copy(&p
, &fsid
, sizeof(fsid
));
55 epoch
= ceph_decode_32(&p
);
57 num_mon
= ceph_decode_32(&p
);
58 ceph_decode_need(&p
, end
, num_mon
*sizeof(m
->mon_inst
[0]), bad
);
60 if (num_mon
>= CEPH_MAX_MON
)
62 m
= kmalloc(sizeof(*m
) + sizeof(m
->mon_inst
[0])*num_mon
, GFP_NOFS
);
64 return ERR_PTR(-ENOMEM
);
68 ceph_decode_copy(&p
, m
->mon_inst
, num_mon
*sizeof(m
->mon_inst
[0]));
69 for (i
= 0; i
< num_mon
; i
++)
70 ceph_decode_addr(&m
->mon_inst
[i
].addr
);
72 dout("monmap_decode epoch %d, num_mon %d\n", m
->epoch
,
74 for (i
= 0; i
< m
->num_mon
; i
++)
75 dout("monmap_decode mon%d is %s\n", i
,
76 pr_addr(&m
->mon_inst
[i
].addr
.in_addr
));
80 dout("monmap_decode failed with %d\n", err
);
86 * return true if *addr is included in the monmap.
88 int ceph_monmap_contains(struct ceph_monmap
*m
, struct ceph_entity_addr
*addr
)
92 for (i
= 0; i
< m
->num_mon
; i
++)
93 if (memcmp(addr
, &m
->mon_inst
[i
].addr
, sizeof(*addr
)) == 0)
99 * Close monitor session, if any.
101 static void __close_session(struct ceph_mon_client
*monc
)
104 dout("__close_session closing mon%d\n", monc
->cur_mon
);
105 ceph_con_revoke(monc
->con
, monc
->m_auth
);
106 ceph_con_close(monc
->con
);
108 monc
->pending_auth
= 0;
109 ceph_auth_reset(monc
->auth
);
114 * Open a session with a (new) monitor.
116 static int __open_session(struct ceph_mon_client
*monc
)
121 if (monc
->cur_mon
< 0) {
122 get_random_bytes(&r
, 1);
123 monc
->cur_mon
= r
% monc
->monmap
->num_mon
;
124 dout("open_session num=%d r=%d -> mon%d\n",
125 monc
->monmap
->num_mon
, r
, monc
->cur_mon
);
127 monc
->sub_renew_after
= jiffies
; /* i.e., expired */
128 monc
->want_next_osdmap
= !!monc
->want_next_osdmap
;
130 dout("open_session mon%d opening\n", monc
->cur_mon
);
131 monc
->con
->peer_name
.type
= CEPH_ENTITY_TYPE_MON
;
132 monc
->con
->peer_name
.num
= cpu_to_le64(monc
->cur_mon
);
133 ceph_con_open(monc
->con
,
134 &monc
->monmap
->mon_inst
[monc
->cur_mon
].addr
);
136 /* initiate authentication handshake */
137 ret
= ceph_auth_build_hello(monc
->auth
,
138 monc
->m_auth
->front
.iov_base
,
139 monc
->m_auth
->front_max
);
140 monc
->m_auth
->front
.iov_len
= ret
;
141 monc
->m_auth
->hdr
.front_len
= cpu_to_le32(ret
);
142 ceph_msg_get(monc
->m_auth
); /* keep our ref */
143 ceph_con_send(monc
->con
, monc
->m_auth
);
145 dout("open_session mon%d already open\n", monc
->cur_mon
);
150 static bool __sub_expired(struct ceph_mon_client
*monc
)
152 return time_after_eq(jiffies
, monc
->sub_renew_after
);
156 * Reschedule delayed work timer.
158 static void __schedule_delayed(struct ceph_mon_client
*monc
)
162 if (monc
->cur_mon
< 0 || __sub_expired(monc
))
166 dout("__schedule_delayed after %u\n", delay
);
167 schedule_delayed_work(&monc
->delayed_work
, delay
);
171 * Send subscribe request for mdsmap and/or osdmap.
173 static void __send_subscribe(struct ceph_mon_client
*monc
)
175 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
176 (unsigned)monc
->sub_sent
, __sub_expired(monc
),
177 monc
->want_next_osdmap
);
178 if ((__sub_expired(monc
) && !monc
->sub_sent
) ||
179 monc
->want_next_osdmap
== 1) {
180 struct ceph_msg
*msg
;
181 struct ceph_mon_subscribe_item
*i
;
184 msg
= ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE
, 96, 0, 0, NULL
);
188 p
= msg
->front
.iov_base
;
189 end
= p
+ msg
->front
.iov_len
;
191 dout("__send_subscribe to 'mdsmap' %u+\n",
192 (unsigned)monc
->have_mdsmap
);
193 if (monc
->want_next_osdmap
) {
194 dout("__send_subscribe to 'osdmap' %u\n",
195 (unsigned)monc
->have_osdmap
);
196 ceph_encode_32(&p
, 3);
197 ceph_encode_string(&p
, end
, "osdmap", 6);
199 i
->have
= cpu_to_le64(monc
->have_osdmap
);
202 monc
->want_next_osdmap
= 2; /* requested */
204 ceph_encode_32(&p
, 2);
206 ceph_encode_string(&p
, end
, "mdsmap", 6);
208 i
->have
= cpu_to_le64(monc
->have_mdsmap
);
211 ceph_encode_string(&p
, end
, "monmap", 6);
217 msg
->front
.iov_len
= p
- msg
->front
.iov_base
;
218 msg
->hdr
.front_len
= cpu_to_le32(msg
->front
.iov_len
);
219 ceph_con_send(monc
->con
, msg
);
221 monc
->sub_sent
= jiffies
| 1; /* never 0 */
225 static void handle_subscribe_ack(struct ceph_mon_client
*monc
,
226 struct ceph_msg
*msg
)
229 struct ceph_mon_subscribe_ack
*h
= msg
->front
.iov_base
;
231 if (msg
->front
.iov_len
< sizeof(*h
))
233 seconds
= le32_to_cpu(h
->duration
);
235 mutex_lock(&monc
->mutex
);
237 pr_info("mon%d %s session established\n",
238 monc
->cur_mon
, pr_addr(&monc
->con
->peer_addr
.in_addr
));
239 monc
->hunting
= false;
241 dout("handle_subscribe_ack after %d seconds\n", seconds
);
242 monc
->sub_renew_after
= monc
->sub_sent
+ (seconds
>> 1)*HZ
- 1;
244 mutex_unlock(&monc
->mutex
);
247 pr_err("got corrupt subscribe-ack msg\n");
252 * Keep track of which maps we have
254 int ceph_monc_got_mdsmap(struct ceph_mon_client
*monc
, u32 got
)
256 mutex_lock(&monc
->mutex
);
257 monc
->have_mdsmap
= got
;
258 mutex_unlock(&monc
->mutex
);
262 int ceph_monc_got_osdmap(struct ceph_mon_client
*monc
, u32 got
)
264 mutex_lock(&monc
->mutex
);
265 monc
->have_osdmap
= got
;
266 monc
->want_next_osdmap
= 0;
267 mutex_unlock(&monc
->mutex
);
272 * Register interest in the next osdmap
274 void ceph_monc_request_next_osdmap(struct ceph_mon_client
*monc
)
276 dout("request_next_osdmap have %u\n", monc
->have_osdmap
);
277 mutex_lock(&monc
->mutex
);
278 if (!monc
->want_next_osdmap
)
279 monc
->want_next_osdmap
= 1;
280 if (monc
->want_next_osdmap
< 2)
281 __send_subscribe(monc
);
282 mutex_unlock(&monc
->mutex
);
288 int ceph_monc_open_session(struct ceph_mon_client
*monc
)
291 monc
->con
= kmalloc(sizeof(*monc
->con
), GFP_KERNEL
);
294 ceph_con_init(monc
->client
->msgr
, monc
->con
);
295 monc
->con
->private = monc
;
296 monc
->con
->ops
= &mon_con_ops
;
299 mutex_lock(&monc
->mutex
);
300 __open_session(monc
);
301 __schedule_delayed(monc
);
302 mutex_unlock(&monc
->mutex
);
307 * The monitor responds with a mount ack to indicate mount success. The
308 * included client ticket allows the client to talk to MDSs and OSDs.
310 static void ceph_monc_handle_map(struct ceph_mon_client
*monc
,
311 struct ceph_msg
*msg
)
313 struct ceph_client
*client
= monc
->client
;
314 struct ceph_monmap
*monmap
= NULL
, *old
= monc
->monmap
;
317 mutex_lock(&monc
->mutex
);
319 dout("handle_monmap\n");
320 p
= msg
->front
.iov_base
;
321 end
= p
+ msg
->front
.iov_len
;
323 monmap
= ceph_monmap_decode(p
, end
);
324 if (IS_ERR(monmap
)) {
325 pr_err("problem decoding monmap, %d\n",
326 (int)PTR_ERR(monmap
));
330 if (ceph_check_fsid(monc
->client
, &monmap
->fsid
) < 0) {
335 client
->monc
.monmap
= monmap
;
339 mutex_unlock(&monc
->mutex
);
340 wake_up(&client
->auth_wq
);
346 static void handle_statfs_reply(struct ceph_mon_client
*monc
,
347 struct ceph_msg
*msg
)
349 struct ceph_mon_statfs_request
*req
;
350 struct ceph_mon_statfs_reply
*reply
= msg
->front
.iov_base
;
353 if (msg
->front
.iov_len
!= sizeof(*reply
))
355 tid
= le64_to_cpu(msg
->hdr
.tid
);
356 dout("handle_statfs_reply %p tid %llu\n", msg
, tid
);
358 mutex_lock(&monc
->mutex
);
359 req
= radix_tree_lookup(&monc
->statfs_request_tree
, tid
);
361 *req
->buf
= reply
->st
;
364 mutex_unlock(&monc
->mutex
);
366 complete(&req
->completion
);
370 pr_err("corrupt statfs reply, no tid\n");
375 * (re)send a statfs request
377 static int send_statfs(struct ceph_mon_client
*monc
,
378 struct ceph_mon_statfs_request
*req
)
380 struct ceph_msg
*msg
;
381 struct ceph_mon_statfs
*h
;
383 dout("send_statfs tid %llu\n", req
->tid
);
384 msg
= ceph_msg_new(CEPH_MSG_STATFS
, sizeof(*h
), 0, 0, NULL
);
388 msg
->hdr
.tid
= cpu_to_le64(req
->tid
);
389 h
= msg
->front
.iov_base
;
390 h
->monhdr
.have_version
= 0;
391 h
->monhdr
.session_mon
= cpu_to_le16(-1);
392 h
->monhdr
.session_mon_tid
= 0;
393 h
->fsid
= monc
->monmap
->fsid
;
394 ceph_con_send(monc
->con
, msg
);
399 * Do a synchronous statfs().
401 int ceph_monc_do_statfs(struct ceph_mon_client
*monc
, struct ceph_statfs
*buf
)
403 struct ceph_mon_statfs_request req
;
407 init_completion(&req
.completion
);
409 /* allocate memory for reply */
410 err
= ceph_msgpool_resv(&monc
->msgpool_statfs_reply
, 1);
414 /* register request */
415 mutex_lock(&monc
->mutex
);
416 req
.tid
= ++monc
->last_tid
;
417 req
.last_attempt
= jiffies
;
418 req
.delay
= BASE_DELAY_INTERVAL
;
419 if (radix_tree_insert(&monc
->statfs_request_tree
, req
.tid
, &req
) < 0) {
420 mutex_unlock(&monc
->mutex
);
421 pr_err("ENOMEM in do_statfs\n");
424 monc
->num_statfs_requests
++;
425 mutex_unlock(&monc
->mutex
);
427 /* send request and wait */
428 err
= send_statfs(monc
, &req
);
430 err
= wait_for_completion_interruptible(&req
.completion
);
432 mutex_lock(&monc
->mutex
);
433 radix_tree_delete(&monc
->statfs_request_tree
, req
.tid
);
434 monc
->num_statfs_requests
--;
435 ceph_msgpool_resv(&monc
->msgpool_statfs_reply
, -1);
436 mutex_unlock(&monc
->mutex
);
444 * Resend pending statfs requests.
446 static void __resend_statfs(struct ceph_mon_client
*monc
)
451 struct ceph_mon_statfs_request
*req
;
454 got
= radix_tree_gang_lookup(&monc
->statfs_request_tree
,
460 next_tid
= req
->tid
+ 1;
462 send_statfs(monc
, req
);
467 * Delayed work. If we haven't mounted yet, retry. Otherwise,
468 * renew/retry subscription as needed (in case it is timing out, or we
469 * got an ENOMEM). And keep the monitor connection alive.
471 static void delayed_work(struct work_struct
*work
)
473 struct ceph_mon_client
*monc
=
474 container_of(work
, struct ceph_mon_client
, delayed_work
.work
);
476 dout("monc delayed_work\n");
477 mutex_lock(&monc
->mutex
);
479 __close_session(monc
);
480 __open_session(monc
); /* continue hunting */
482 ceph_con_keepalive(monc
->con
);
483 mutex_unlock(&monc
->mutex
);
485 __validate_auth(monc
);
487 mutex_lock(&monc
->mutex
);
488 if (monc
->auth
->ops
->is_authenticated(monc
->auth
))
489 __send_subscribe(monc
);
491 __schedule_delayed(monc
);
492 mutex_unlock(&monc
->mutex
);
496 * On startup, we build a temporary monmap populated with the IPs
497 * provided by mount(2).
499 static int build_initial_monmap(struct ceph_mon_client
*monc
)
501 struct ceph_mount_args
*args
= monc
->client
->mount_args
;
502 struct ceph_entity_addr
*mon_addr
= args
->mon_addr
;
503 int num_mon
= args
->num_mon
;
506 /* build initial monmap */
507 monc
->monmap
= kzalloc(sizeof(*monc
->monmap
) +
508 num_mon
*sizeof(monc
->monmap
->mon_inst
[0]),
512 for (i
= 0; i
< num_mon
; i
++) {
513 monc
->monmap
->mon_inst
[i
].addr
= mon_addr
[i
];
514 monc
->monmap
->mon_inst
[i
].addr
.nonce
= 0;
515 monc
->monmap
->mon_inst
[i
].name
.type
=
516 CEPH_ENTITY_TYPE_MON
;
517 monc
->monmap
->mon_inst
[i
].name
.num
= cpu_to_le64(i
);
519 monc
->monmap
->num_mon
= num_mon
;
520 monc
->have_fsid
= false;
522 /* release addr memory */
523 kfree(args
->mon_addr
);
524 args
->mon_addr
= NULL
;
529 int ceph_monc_init(struct ceph_mon_client
*monc
, struct ceph_client
*cl
)
534 memset(monc
, 0, sizeof(*monc
));
537 mutex_init(&monc
->mutex
);
539 err
= build_initial_monmap(monc
);
546 monc
->auth
= ceph_auth_init(cl
->mount_args
->name
,
547 cl
->mount_args
->secret
);
548 if (IS_ERR(monc
->auth
))
549 return PTR_ERR(monc
->auth
);
550 monc
->auth
->want_keys
=
551 CEPH_ENTITY_TYPE_AUTH
| CEPH_ENTITY_TYPE_MON
|
552 CEPH_ENTITY_TYPE_OSD
| CEPH_ENTITY_TYPE_MDS
;
555 err
= ceph_msgpool_init(&monc
->msgpool_subscribe_ack
,
556 sizeof(struct ceph_mon_subscribe_ack
), 1, false);
559 err
= ceph_msgpool_init(&monc
->msgpool_statfs_reply
,
560 sizeof(struct ceph_mon_statfs_reply
), 0, false);
563 err
= ceph_msgpool_init(&monc
->msgpool_auth_reply
, 4096, 1, false);
567 monc
->m_auth
= ceph_msg_new(CEPH_MSG_AUTH
, 4096, 0, 0, NULL
);
568 monc
->pending_auth
= 0;
569 if (IS_ERR(monc
->m_auth
)) {
570 err
= PTR_ERR(monc
->m_auth
);
576 monc
->hunting
= true;
577 monc
->sub_renew_after
= jiffies
;
580 INIT_DELAYED_WORK(&monc
->delayed_work
, delayed_work
);
581 INIT_RADIX_TREE(&monc
->statfs_request_tree
, GFP_NOFS
);
582 monc
->num_statfs_requests
= 0;
585 monc
->have_mdsmap
= 0;
586 monc
->have_osdmap
= 0;
587 monc
->want_next_osdmap
= 1;
591 ceph_msgpool_destroy(&monc
->msgpool_auth_reply
);
593 ceph_msgpool_destroy(&monc
->msgpool_subscribe_ack
);
595 ceph_msgpool_destroy(&monc
->msgpool_statfs_reply
);
602 void ceph_monc_stop(struct ceph_mon_client
*monc
)
605 cancel_delayed_work_sync(&monc
->delayed_work
);
607 mutex_lock(&monc
->mutex
);
608 __close_session(monc
);
610 monc
->con
->private = NULL
;
611 monc
->con
->ops
->put(monc
->con
);
614 mutex_unlock(&monc
->mutex
);
616 ceph_auth_destroy(monc
->auth
);
618 ceph_msg_put(monc
->m_auth
);
619 ceph_msgpool_destroy(&monc
->msgpool_subscribe_ack
);
620 ceph_msgpool_destroy(&monc
->msgpool_statfs_reply
);
621 ceph_msgpool_destroy(&monc
->msgpool_auth_reply
);
626 static void __send_prepared_auth_request(struct ceph_mon_client
*monc
, int len
)
628 monc
->pending_auth
= 1;
629 monc
->m_auth
->front
.iov_len
= len
;
630 monc
->m_auth
->hdr
.front_len
= cpu_to_le32(len
);
631 ceph_msg_get(monc
->m_auth
); /* keep our ref */
632 ceph_con_send(monc
->con
, monc
->m_auth
);
636 static void handle_auth_reply(struct ceph_mon_client
*monc
,
637 struct ceph_msg
*msg
)
641 mutex_lock(&monc
->mutex
);
642 monc
->pending_auth
= 0;
643 ret
= ceph_handle_auth_reply(monc
->auth
, msg
->front
.iov_base
,
645 monc
->m_auth
->front
.iov_base
,
646 monc
->m_auth
->front_max
);
648 monc
->client
->auth_err
= ret
;
649 wake_up(&monc
->client
->auth_wq
);
650 } else if (ret
> 0) {
651 __send_prepared_auth_request(monc
, ret
);
652 } else if (monc
->auth
->ops
->is_authenticated(monc
->auth
)) {
653 dout("authenticated, starting session\n");
655 monc
->client
->msgr
->inst
.name
.type
= CEPH_ENTITY_TYPE_CLIENT
;
656 monc
->client
->msgr
->inst
.name
.num
= monc
->auth
->global_id
;
658 __send_subscribe(monc
);
659 __resend_statfs(monc
);
661 mutex_unlock(&monc
->mutex
);
664 static int __validate_auth(struct ceph_mon_client
*monc
)
668 if (monc
->pending_auth
)
671 ret
= ceph_build_auth(monc
->auth
, monc
->m_auth
->front
.iov_base
,
672 monc
->m_auth
->front_max
);
674 return ret
; /* either an error, or no need to authenticate */
675 __send_prepared_auth_request(monc
, ret
);
679 int ceph_monc_validate_auth(struct ceph_mon_client
*monc
)
683 mutex_lock(&monc
->mutex
);
684 ret
= __validate_auth(monc
);
685 mutex_unlock(&monc
->mutex
);
690 * handle incoming message
692 static void dispatch(struct ceph_connection
*con
, struct ceph_msg
*msg
)
694 struct ceph_mon_client
*monc
= con
->private;
695 int type
= le16_to_cpu(msg
->hdr
.type
);
701 case CEPH_MSG_AUTH_REPLY
:
702 handle_auth_reply(monc
, msg
);
705 case CEPH_MSG_MON_SUBSCRIBE_ACK
:
706 handle_subscribe_ack(monc
, msg
);
709 case CEPH_MSG_STATFS_REPLY
:
710 handle_statfs_reply(monc
, msg
);
713 case CEPH_MSG_MON_MAP
:
714 ceph_monc_handle_map(monc
, msg
);
717 case CEPH_MSG_MDS_MAP
:
718 ceph_mdsc_handle_map(&monc
->client
->mdsc
, msg
);
721 case CEPH_MSG_OSD_MAP
:
722 ceph_osdc_handle_map(&monc
->client
->osdc
, msg
);
726 pr_err("received unknown message type %d %s\n", type
,
727 ceph_msg_type_name(type
));
733 * Allocate memory for incoming message
735 static struct ceph_msg
*mon_alloc_msg(struct ceph_connection
*con
,
736 struct ceph_msg_header
*hdr
,
739 struct ceph_mon_client
*monc
= con
->private;
740 int type
= le16_to_cpu(hdr
->type
);
741 int front_len
= le32_to_cpu(hdr
->front_len
);
747 case CEPH_MSG_MON_SUBSCRIBE_ACK
:
748 m
= ceph_msgpool_get(&monc
->msgpool_subscribe_ack
, front_len
);
750 case CEPH_MSG_STATFS_REPLY
:
751 m
= ceph_msgpool_get(&monc
->msgpool_statfs_reply
, front_len
);
753 case CEPH_MSG_AUTH_REPLY
:
754 m
= ceph_msgpool_get(&monc
->msgpool_auth_reply
, front_len
);
767 * If the monitor connection resets, pick a new monitor and resubmit
768 * any pending requests.
770 static void mon_fault(struct ceph_connection
*con
)
772 struct ceph_mon_client
*monc
= con
->private;
778 mutex_lock(&monc
->mutex
);
782 if (monc
->con
&& !monc
->hunting
)
783 pr_info("mon%d %s session lost, "
784 "hunting for new mon\n", monc
->cur_mon
,
785 pr_addr(&monc
->con
->peer_addr
.in_addr
));
787 __close_session(monc
);
788 if (!monc
->hunting
) {
790 monc
->hunting
= true;
791 __open_session(monc
);
793 /* already hunting, let's wait a bit */
794 __schedule_delayed(monc
);
797 mutex_unlock(&monc
->mutex
);
800 const static struct ceph_connection_operations mon_con_ops
= {
803 .dispatch
= dispatch
,
805 .alloc_msg
= mon_alloc_msg
,