Fix: check for removal of session's shm_path in destroy()
[lttng-tools.git] / src / bin / lttng-sessiond / cmd.c
index ad635a9a96b631880171e012aeb462311424bade..65a24e4dcdbd492e3abe14969a6f92be45e5486c 100644 (file)
@@ -21,6 +21,7 @@
 #include <inttypes.h>
 #include <urcu/list.h>
 #include <urcu/uatomic.h>
+#include <sys/stat.h>
 
 #include <common/defaults.h>
 #include <common/common.h>
@@ -47,7 +48,7 @@
 #include "kernel-consumer.h"
 #include "lttng-sessiond.h"
 #include "utils.h"
-#include "syscall.h"
+#include "lttng-syscall.h"
 #include "agent.h"
 #include "buffer-registry.h"
 #include "notification-thread.h"
 #include "rotate.h"
 #include "rotation-thread.h"
 #include "sessiond-timer.h"
+#include "agent-thread.h"
 
 #include "cmd.h"
 
+/*
+ * Delay, in microseconds, passed to usleep() between two consecutive checks
+ * for the shm path's deletion: 100000 us, i.e. sleep for 100ms between each
+ * check.
+ */
+#define SESSION_DESTROY_SHM_PATH_CHECK_DELAY_US 100000
+
+/*
+ * Forward declaration: wait_on_path() is referenced by the static
+ * initializer of destroy_completion_handler but is only defined further
+ * down in this file.
+ */
+static enum lttng_error_code wait_on_path(void *path);
+
+/*
+ * Completion handler installed by the destroy command for sessions that
+ * were created with a non-default shm_path; it carries a private copy of
+ * the path, exposed to wait_on_path() through the handler's 'data' pointer.
+ * See comment in cmd_destroy_session() for the rationale.
+ */
+struct destroy_completion_handler {
+       struct cmd_completion_handler handler;
+       char shm_path[member_sizeof(struct ltt_session, shm_path)];
+};
+
+static struct destroy_completion_handler destroy_completion_handler = {
+       .shm_path = "",
+       .handler.run = wait_on_path,
+       .handler.data = destroy_completion_handler.shm_path,
+};
+
+/*
+ * Handler to run on completion of the last command executed, or NULL.
+ * Set by cmd_destroy_session() when the session has a non-empty shm_path;
+ * returned and reset to NULL by cmd_pop_completion_handler().
+ */
+static struct cmd_completion_handler *current_completion_handler;
+
 /*
  * Used to keep a unique index for each relayd socket created where this value
  * is associated with streams on the consumer so it can match the right relayd
@@ -917,8 +943,8 @@ error:
  *
  * The consumer socket lock must be held by the caller.
  */
-static int send_consumer_relayd_socket(enum lttng_domain_type domain,
-               unsigned int session_id, struct lttng_uri *relayd_uri,
+static int send_consumer_relayd_socket(unsigned int session_id,
+               struct lttng_uri *relayd_uri,
                struct consumer_output *consumer,
                struct consumer_socket *consumer_sock,
                char *session_name, char *hostname, int session_live_timer)
@@ -1003,7 +1029,7 @@ static int send_consumer_relayd_sockets(enum lttng_domain_type domain,
 
        /* Sending control relayd socket. */
        if (!sock->control_sock_sent) {
-               ret = send_consumer_relayd_socket(domain, session_id,
+               ret = send_consumer_relayd_socket(session_id,
                                &consumer->dst.net.control, consumer, sock,
                                session_name, hostname, session_live_timer);
                if (ret != LTTNG_OK) {
@@ -1013,7 +1039,7 @@ static int send_consumer_relayd_sockets(enum lttng_domain_type domain,
 
        /* Sending data relayd socket. */
        if (!sock->data_sock_sent) {
-               ret = send_consumer_relayd_socket(domain, session_id,
+               ret = send_consumer_relayd_socket(session_id,
                                &consumer->dst.net.data, consumer, sock,
                                session_name, hostname, session_live_timer);
                if (ret != LTTNG_OK) {
@@ -1152,7 +1178,7 @@ static int start_kernel_session(struct ltt_kernel_session *ksess, int wpipe)
        }
 
        /* Quiescent wait after starting trace */
-       kernel_wait_quiescent(kernel_tracer_fd);
+       kernel_wait_quiescent(wpipe);
 
        ksess->active = 1;
 
@@ -1384,9 +1410,15 @@ int cmd_enable_channel(struct ltt_session *session,
                break;
        }
        case LTTNG_DOMAIN_UST:
+               break;
        case LTTNG_DOMAIN_JUL:
        case LTTNG_DOMAIN_LOG4J:
        case LTTNG_DOMAIN_PYTHON:
+               if (!agent_tracing_is_enabled()) {
+                       DBG("Attempted to enable a channel in an agent domain but the agent thread is not running");
+                       ret = LTTNG_ERR_AGENT_TRACING_DISABLED;
+                       goto error;
+               }
                break;
        default:
                ret = LTTNG_ERR_UNKNOWN_DOMAIN;
@@ -2094,6 +2126,12 @@ static int _cmd_enable_event(struct ltt_session *session,
 
                assert(usess);
 
+               if (!agent_tracing_is_enabled()) {
+                       DBG("Attempted to enable an event in an agent domain but the agent thread is not running");
+                       ret = LTTNG_ERR_AGENT_TRACING_DISABLED;
+                       goto error;
+               }
+
                agt = trace_ust_find_agent(usess, domain->type);
                if (!agt) {
                        agt = agent_create(domain->type);
@@ -2600,7 +2638,7 @@ int rename_active_chunk(struct ltt_session *session)
 {
        int ret;
 
-       session->rotate_count++;
+       session->current_archive_id++;
 
        /*
         * The currently active tracing path is now the folder we
@@ -2633,7 +2671,7 @@ int rename_active_chunk(struct ltt_session *session)
                goto end;
        }
 end:
-       session->rotate_count--;
+       session->current_archive_id--;
        return ret;
 }
 
@@ -2669,7 +2707,7 @@ int cmd_stop_trace(struct ltt_session *session)
                sessiond_rotate_timer_stop(session);
        }
 
-       if (session->rotate_count > 0 && !session->rotate_pending) {
+       if (session->current_archive_id > 0 && !session->rotate_pending) {
                ret = rename_active_chunk(session);
                if (ret) {
                        /*
@@ -3017,6 +3055,59 @@ int cmd_destroy_session(struct ltt_session *session, int wpipe,
                PERROR("write kernel poll pipe");
        }
 
+       if (session->shm_path[0]) {
+               /*
+                * When a session is created with an explicit shm_path,
+                * the consumer daemon will create its shared memory files
+                * at that location and will *not* unlink them. This is normal
+                * as the intention of that feature is to make it possible
+                * to retrieve the content of those files should a crash occur.
+                *
+                * To ensure the content of those files can be used, the
+                * session daemon will replicate the content of the metadata
+                * cache in a metadata file.
+                *
+                * On clean-up, it is expected that the consumer daemon will
+                * unlink the shared memory files and that the session daemon
+                * will unlink the metadata file. Then, the session's directory
+                * in the shm path can be removed.
+                *
+                * Unfortunately, a flaw in the design of the sessiond's and
+                * consumerd's tear down of channels makes it impossible to
+                * determine when the sessiond _and_ the consumerd have both
+                * destroyed their representation of a channel. For one, the
+                * unlinking, close, and rmdir happen in deferred 'call_rcu'
+                * callbacks in both daemons.
+                *
+                * However, it is also impossible for the sessiond to know when
+                * the consumer daemon is done destroying its channel(s) since
+                * it occurs as a reaction to the closing of the channel's file
+                * descriptor. There is no resulting communication initiated
+                * from the consumerd to the sessiond to confirm that the
+                * operation is completed (and was successful).
+                *
+                * Until this is all fixed, the session daemon checks for the
+                * removal of the session's shm path which makes it possible
+                * to safely advertise a session as having been destroyed.
+                *
+                * Prior to this fix, it was not possible to reliably save
+                * a session making use of the --shm-path option, destroy it,
+                * and load it again. This is because the creation of the
+                * session would fail upon seeing the session's shm path
+                * already in existence.
+                *
+                * Note that none of the error paths in the check for the
+                * directory's existence return an error. This is normal
+                * as there isn't much that can be done. The session will
+                * be destroyed properly, except that we can't offer the
+                * guarantee that the same session can be re-created.
+                */
+               current_completion_handler = &destroy_completion_handler.handler;
+               ret = lttng_strncpy(destroy_completion_handler.shm_path,
+                               session->shm_path,
+                               sizeof(destroy_completion_handler.shm_path));
+               assert(!ret);
+       }
        ret = session_destroy(session);
 
        return ret;
@@ -4430,7 +4521,7 @@ int cmd_rotate_session(struct ltt_session *session,
        }
 
        /* Special case for the first rotation. */
-       if (session->rotate_count == 0) {
+       if (session->current_archive_id == 0) {
                const char *base_path = NULL;
 
                /* Either one of the two sessions is enough to get the root path. */
@@ -4466,7 +4557,7 @@ int cmd_rotate_session(struct ltt_session *session,
        }
        DBG("Current rotate path %s", session->rotation_chunk.current_rotate_path);
 
-       session->rotate_count++;
+       session->current_archive_id++;
        session->rotate_pending = true;
        session->rotation_state = LTTNG_ROTATION_STATE_ONGOING;
 
@@ -4503,7 +4594,7 @@ int cmd_rotate_session(struct ltt_session *session,
                                sizeof(session->rotation_chunk.active_tracing_path),
                                "%s/%s-%" PRIu64,
                                session_get_base_path(session),
-                               datetime, session->rotate_count + 1);
+                               datetime, session->current_archive_id + 1);
                if (ret < 0 || ret == sizeof(session->rotation_chunk.active_tracing_path)) {
                        ERR("Failed to format active kernel tracing path in rotate session command");
                        ret = -LTTNG_ERR_UNK;
@@ -4516,7 +4607,7 @@ int cmd_rotate_session(struct ltt_session *session,
                ret = snprintf(session->kernel_session->consumer->chunk_path,
                                sizeof(session->kernel_session->consumer->chunk_path),
                                "/%s-%" PRIu64, datetime,
-                               session->rotate_count + 1);
+                               session->current_archive_id + 1);
                if (ret < 0 || ret == sizeof(session->kernel_session->consumer->chunk_path)) {
                        ERR("Failed to format the kernel consumer's sub-directory in rotate session command");
                        ret = -LTTNG_ERR_UNK;
@@ -4545,7 +4636,7 @@ int cmd_rotate_session(struct ltt_session *session,
                ret = snprintf(session->rotation_chunk.active_tracing_path,
                                PATH_MAX, "%s/%s-%" PRIu64,
                                session_get_base_path(session),
-                               datetime, session->rotate_count + 1);
+                               datetime, session->current_archive_id + 1);
                if (ret < 0) {
                        ERR("Failed to format active UST tracing path in rotate session command");
                        ret = -LTTNG_ERR_UNK;
@@ -4553,7 +4644,7 @@ int cmd_rotate_session(struct ltt_session *session,
                }
                ret = snprintf(session->ust_session->consumer->chunk_path,
                                PATH_MAX, "/%s-%" PRIu64, datetime,
-                               session->rotate_count + 1);
+                               session->current_archive_id + 1);
                if (ret < 0) {
                        ERR("Failed to format the UST consumer's sub-directory in rotate session command");
                        ret = -LTTNG_ERR_UNK;
@@ -4596,11 +4687,11 @@ int cmd_rotate_session(struct ltt_session *session,
        }
 
        if (rotate_return) {
-               rotate_return->rotation_id = session->rotate_count;
+               rotate_return->rotation_id = session->current_archive_id;
        }
 
-       DBG("Cmd rotate session %s, rotate_id %" PRIu64 " sent", session->name,
-                       session->rotate_count);
+       DBG("Cmd rotate session %s, current_archive_id %" PRIu64 " sent",
+                       session->name, session->current_archive_id);
        ret = LTTNG_OK;
 
 end:
@@ -4623,9 +4714,9 @@ int cmd_rotate_get_info(struct ltt_session *session,
        assert(session);
 
        DBG("Cmd rotate_get_info session %s, rotation id %" PRIu64, session->name,
-                       session->rotate_count);
+                       session->current_archive_id);
 
-       if (session->rotate_count != rotation_id) {
+       if (session->current_archive_id != rotation_id) {
                info_return->status = (int32_t) LTTNG_ROTATION_STATE_EXPIRED;
                ret = LTTNG_OK;
                goto end;
@@ -4648,7 +4739,7 @@ int cmd_rotate_get_info(struct ltt_session *session,
                        current_tracing_path_reply_len =
                                        sizeof(info_return->location.local.absolute_path);
                        info_return->location_type =
-                                       (uint8_t) LTTNG_TRACE_ARCHIVE_LOCATION_TYPE_LOCAL;
+                                       (int8_t) LTTNG_TRACE_ARCHIVE_LOCATION_TYPE_LOCAL;
                        break;
                case CONSUMER_DST_NET:
                        current_tracing_path_reply =
@@ -4657,7 +4748,7 @@ int cmd_rotate_get_info(struct ltt_session *session,
                                        sizeof(info_return->location.relay.relative_path);
                        /* Currently the only supported relay protocol. */
                        info_return->location.relay.protocol =
-                                       (uint8_t) LTTNG_TRACE_ARCHIVE_LOCATION_RELAY_PROTOCOL_TYPE_TCP;
+                                       (int8_t) LTTNG_TRACE_ARCHIVE_LOCATION_RELAY_PROTOCOL_TYPE_TCP;
 
                        ret = lttng_strncpy(info_return->location.relay.host,
                                        session_get_net_consumer_hostname(session),
@@ -4673,7 +4764,7 @@ int cmd_rotate_get_info(struct ltt_session *session,
                                        &info_return->location.relay.ports.control,
                                        &info_return->location.relay.ports.data);
                        info_return->location_type =
-                                       (uint8_t) LTTNG_TRACE_ARCHIVE_LOCATION_TYPE_RELAY;
+                                       (int8_t) LTTNG_TRACE_ARCHIVE_LOCATION_TYPE_RELAY;
                        break;
                default:
                        abort();
@@ -4814,7 +4905,7 @@ int cmd_session_get_current_output(struct ltt_session *session,
        const char *path;
 
        if (!session->snapshot_mode) {
-               if (session->rotate_count == 0) {
+               if (session->current_archive_id == 0) {
                        if (session->kernel_session) {
                                path = session_get_base_path(session);
                        } else if (session->ust_session) {
@@ -4851,6 +4942,49 @@ end:
        return ret;
 }
 
+/*
+ * Poll a path with stat() until it is removed, sleeping between checks.
+ * Gives up early if stat() fails for a reason other than ENOENT or if the
+ * path is not a directory. Always returns LTTNG_OK.
+ */
+static enum lttng_error_code wait_on_path(void *path_data)
+{
+       struct stat path_stat;
+       const char *const path = path_data;
+
+       DBG("Waiting for the shm path at %s to be removed before completing session destruction",
+                       path);
+       for (;;) {
+               if (stat(path, &path_stat) != 0) {
+                       if (errno == ENOENT) {
+                               DBG("shm path no longer exists, completing the destruction of session");
+                       } else {
+                               PERROR("stat() returned an error while checking for the existence of the shm path");
+                       }
+                       break;
+               }
+               if (!S_ISDIR(path_stat.st_mode)) {
+                       ERR("The type of shm path %s returned by stat() is not a directory; aborting the wait for shm path removal",
+                                       path);
+                       break;
+               }
+               usleep(SESSION_DESTROY_SHM_PATH_CHECK_DELAY_US);
+       }
+       return LTTNG_OK;
+}
+
+/*
+ * Hand over the pending completion handler, if any, and clear it so that
+ * it is returned only once. NULL means no handler has to be run.
+ */
+const struct cmd_completion_handler *cmd_pop_completion_handler(void)
+{
+       const struct cmd_completion_handler *const pending =
+                       current_completion_handler;
+       current_completion_handler = NULL;
+       return pending;
+}
+
 /*
  * Init command subsystem.
  */
This page took 0.031344 seconds and 5 git commands to generate.