From 2961f09e9ea0e7a23dade01014caa40fb323d6bd Mon Sep 17 00:00:00 2001 From: =?utf8?q?J=C3=A9r=C3=A9mie=20Galarneau?= Date: Wed, 14 Nov 2018 16:52:12 -0500 Subject: [PATCH] Fix: rotation error may leave session in "ONGOING" state MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The errors that can occur during the launch of a rotation may leave a session's rotation state in the "ONGOING" state. This means that clients polling for the rotation's state (or using the notification channel) will never see the rotation enter the ERROR or COMPLETED states, resulting in a hang. This change introduces session_reset_rotation_state() which implements the logic needed to reset a session's rotation state and populate the result of the last rotation. Signed-off-by: Jérémie Galarneau --- src/bin/lttng-sessiond/cmd.c | 73 +++++++++++++++--------- src/bin/lttng-sessiond/rotation-thread.c | 21 ++++++- src/bin/lttng-sessiond/session.c | 33 +++++++++++ src/bin/lttng-sessiond/session.h | 3 + 4 files changed, 101 insertions(+), 29 deletions(-) diff --git a/src/bin/lttng-sessiond/cmd.c b/src/bin/lttng-sessiond/cmd.c index 545269525..4d4d0088c 100644 --- a/src/bin/lttng-sessiond/cmd.c +++ b/src/bin/lttng-sessiond/cmd.c @@ -4531,6 +4531,11 @@ int cmd_rotate_session(struct ltt_session *session, struct tm *timeinfo; char datetime[21]; time_t now; + /* + * Used to roll-back timestamps in case of failure to launch the + * rotation. + */ + time_t original_last_chunk_start_ts, original_current_chunk_start_ts; assert(session); @@ -4611,29 +4616,17 @@ int cmd_rotate_session(struct ltt_session *session, * archive id. */ session->current_archive_id++; - /* - * A rotation has a local step even if the destination is a relay - * daemon; the buffers must be consumed by the consumer daemon. - */ - session->rotation_pending_local = true; - session->rotation_pending_relay = - session_get_consumer_destination_type(session) == CONSUMER_DST_NET; - session->rotation_state = LTTNG_ROTATION_STATE_ONGOING; - ret = notification_thread_command_session_rotation_ongoing( - notification_thread_handle, - session->name, session->uid, session->gid, - session->current_archive_id - 1); - if (ret != LTTNG_OK) { - ERR("Failed to notify notification thread that a session rotation is ongoing for session %s", - session->name); - } - /* Create the path name for the next chunk. */ now = time(NULL); if (now == (time_t) -1) { ret = -LTTNG_ERR_ROTATION_NOT_AVAILABLE; goto end; } + + /* Sample chunk bounds for roll-back in case of error. */ + original_last_chunk_start_ts = session->last_chunk_start_ts; + original_current_chunk_start_ts = session->current_chunk_start_ts; + session->last_chunk_start_ts = session->current_chunk_start_ts; session->current_chunk_start_ts = now; @@ -4650,6 +4643,16 @@ int cmd_rotate_session(struct ltt_session *session, ret = -LTTNG_ERR_UNK; goto end; } + + /* + * A rotation has a local step even if the destination is a relay + * daemon; the buffers must be consumed by the consumer daemon. + */ + session->rotation_pending_local = true; + session->rotation_pending_relay = + session_get_consumer_destination_type(session) == CONSUMER_DST_NET; + session->rotation_state = LTTNG_ROTATION_STATE_ONGOING; + if (session->kernel_session) { /* * The active path for the next rotation/destroy. @@ -4663,7 +4666,7 @@ int cmd_rotate_session(struct ltt_session *session, if (ret < 0 || ret == sizeof(session->rotation_chunk.active_tracing_path)) { ERR("Failed to format active kernel tracing path in rotate session command"); ret = -LTTNG_ERR_UNK; - goto end; + goto error; } /* * The sub-directory for the consumer @@ -4676,7 +4679,7 @@ int cmd_rotate_session(struct ltt_session *session, if (ret < 0 || ret == sizeof(session->kernel_session->consumer->chunk_path)) { ERR("Failed to format the kernel consumer's sub-directory in rotate session command"); ret = -LTTNG_ERR_UNK; - goto end; + goto error; } /* * Create the new chunk folder, before the rotation begins so we don't @@ -4689,12 +4692,12 @@ int cmd_rotate_session(struct ltt_session *session, ERR("Failed to create kernel session tracing path at %s", session->kernel_session->consumer->chunk_path); ret = -LTTNG_ERR_CREATE_DIR_FAIL; - goto end; + goto error; } ret = kernel_rotate_session(session); if (ret != LTTNG_OK) { ret = -ret; - goto end; + goto error; } } if (session->ust_session) { @@ -4705,7 +4708,7 @@ int cmd_rotate_session(struct ltt_session *session, if (ret < 0) { ERR("Failed to format active UST tracing path in rotate session command"); ret = -LTTNG_ERR_UNK; - goto end; + goto error; } ret = snprintf(session->ust_session->consumer->chunk_path, PATH_MAX, "/%s-%" PRIu64, datetime, @@ -4713,7 +4716,7 @@ int cmd_rotate_session(struct ltt_session *session, if (ret < 0) { ERR("Failed to format the UST consumer's sub-directory in rotate session command"); ret = -LTTNG_ERR_UNK; - goto end; + goto error; } /* * Create the new chunk folder, before the rotation begins so we don't @@ -4724,18 +4727,18 @@ int cmd_rotate_session(struct ltt_session *session, session->ust_session->gid); if (ret) { ret = -LTTNG_ERR_CREATE_DIR_FAIL; - goto end; + goto error; } ret = ust_app_rotate_session(session); if (ret != LTTNG_OK) { - goto end; + goto error; } } ret = timer_session_rotation_pending_check_start(session, DEFAULT_ROTATE_PENDING_TIMER); if (ret) { - goto end; + goto error; } if (!session->active) { @@ -4746,12 +4749,30 @@ int cmd_rotate_session(struct ltt_session *session, rotate_return->rotation_id = session->current_archive_id; } + ret = notification_thread_command_session_rotation_ongoing( + notification_thread_handle, + session->name, session->uid, session->gid, + session->current_archive_id - 1); + if (ret != LTTNG_OK) { + ERR("Failed to notify notification thread that a session rotation is ongoing for session %s", + session->name); + } + DBG("Cmd rotate session %s, archive_id %" PRIu64 " sent", session->name, session->current_archive_id - 1); ret = LTTNG_OK; end: return ret; +error: + session->last_chunk_start_ts = original_last_chunk_start_ts; + session->current_archive_id = original_current_chunk_start_ts; + if (session_reset_rotation_state(session, + LTTNG_ROTATION_STATE_NO_ROTATION)) { + ERR("Failed to reset rotation state of session \"%s\"", + session->name); + } + goto end; } /* diff --git a/src/bin/lttng-sessiond/rotation-thread.c b/src/bin/lttng-sessiond/rotation-thread.c index 3256a1e9b..68a5d402f 100644 --- a/src/bin/lttng-sessiond/rotation-thread.c +++ b/src/bin/lttng-sessiond/rotation-thread.c @@ -431,7 +431,12 @@ end: session->rotation_pending_local = false; } if (ret) { - session->rotation_state = LTTNG_ROTATION_STATE_ERROR; + ret = session_reset_rotation_state(session, + LTTNG_ROTATION_STATE_ERROR); + if (ret) { + ERR("Failed to reset rotation state of session \"%s\"", + session->name); + } } return 0; } @@ -491,7 +496,12 @@ int check_session_rotation_pending_relay(struct ltt_session *session) ERR("[rotation-thread] Encountered an error when checking if rotation of trace archive %" PRIu64 " of session \"%s\" is pending on the relay", session->current_archive_id - 1, session->name); - session->rotation_state = LTTNG_ROTATION_STATE_ERROR; + ret = session_reset_rotation_state(session, + LTTNG_ROTATION_STATE_ERROR); + if (ret) { + ERR("Failed to reset rotation state of session \"%s\"", + session->name); + } rotation_completed = false; } @@ -555,7 +565,12 @@ int check_session_rotation_pending(struct ltt_session *session, /* Rename the completed trace archive's location. */ now = time(NULL); if (now == (time_t) -1) { - session->rotation_state = LTTNG_ROTATION_STATE_ERROR; + ret = session_reset_rotation_state(session, + LTTNG_ROTATION_STATE_ERROR); + if (ret) { + ERR("Failed to reset rotation state of session \"%s\"", + session->name); + } ret = LTTNG_ERR_UNK; goto end; } diff --git a/src/bin/lttng-sessiond/session.c b/src/bin/lttng-sessiond/session.c index 56c38714f..abf61e898 100644 --- a/src/bin/lttng-sessiond/session.c +++ b/src/bin/lttng-sessiond/session.c @@ -571,3 +571,36 @@ int session_access_ok(struct ltt_session *session, uid_t uid, gid_t gid) return 1; } } + +/* + * Set a session's rotation state and reset all associated state. + * + * This function resets the rotation state (check timers, pending + * flags, etc.) and sets the result of the last rotation. The result + * can be queries by a liblttng-ctl client. + * + * Be careful of the result passed to this function. For instance, + * on failure to launch a rotation, a client will expect the rotation + * state to be set to "NO_ROTATION". If an error occured while the + * rotation was "ONGOING", result should be set to "ERROR", which will + * allow a client to report it. + * + * Must be called with the session and session_list locks held. + */ +int session_reset_rotation_state(struct ltt_session *session, + enum lttng_rotation_state result) +{ + int ret = 0; + + ASSERT_LOCKED(ltt_session_list.lock); + ASSERT_LOCKED(session->lock); + + session->rotation_pending_local = false; + session->rotation_pending_relay = false; + session->rotated_after_last_stop = false; + session->rotation_state = result; + if (session->rotation_pending_check_timer_enabled) { + ret = timer_session_rotation_pending_check_stop(session); + } + return ret; +} diff --git a/src/bin/lttng-sessiond/session.h b/src/bin/lttng-sessiond/session.h index 99b8c4711..1f0aeb94e 100644 --- a/src/bin/lttng-sessiond/session.h +++ b/src/bin/lttng-sessiond/session.h @@ -229,4 +229,7 @@ struct ltt_session_list *session_get_list(void); int session_access_ok(struct ltt_session *session, uid_t uid, gid_t gid); +int session_reset_rotation_state(struct ltt_session *session, + enum lttng_rotation_state result); + #endif /* _LTT_SESSION_H */ -- 2.34.1