Fix: check for removal of session's shm_path in destroy()

author Jérémie Galarneau <jeremie.galarneau@efficios.com>

Wed, 13 Jun 2018 00:13:48 +0000 (20:13 -0400)

committer Jérémie Galarneau <jeremie.galarneau@efficios.com>

Wed, 13 Jun 2018 21:27:01 +0000 (17:27 -0400)
author Jérémie Galarneau <jeremie.galarneau@efficios.com>
Wed, 13 Jun 2018 00:13:48 +0000 (20:13 -0400)
committer Jérémie Galarneau <jeremie.galarneau@efficios.com>
Wed, 13 Jun 2018 21:27:01 +0000 (17:27 -0400)
diff --git a/src/bin/lttng-sessiond/cmd.c b/src/bin/lttng-sessiond/cmd.c

index 6722341bb4b1be79f11a88b96c8989a3cf15e734..65a24e4dcdbd492e3abe14969a6f92be45e5486c 100644 (file)
--- a/src/bin/lttng-sessiond/cmd.c
+++ b/src/bin/lttng-sessiond/cmd.c
@@ -21,6 +21,7 @@
  #include <inttypes.h>
  #include <urcu/list.h>
  #include <urcu/uatomic.h>
+#include <sys/stat.h>
  
  #include <common/defaults.h>
  #include <common/common.h>
@@ -59,6 +60,30 @@
  
  #include "cmd.h"
  
+/* Sleep for 100ms between each check for the shm path's deletion. */
+#define SESSION_DESTROY_SHM_PATH_CHECK_DELAY_US 100000
+
+static enum lttng_error_code wait_on_path(void *path);
+
+/*
+ * Command completion handler that is used by the destroy command
+ * when a session that has a non-default shm_path is being destroyed.
+ *
+ * See comment in cmd_destroy_session() for the rationale.
+ */
+static struct destroy_completion_handler {
+       struct cmd_completion_handler handler;
+       char shm_path[member_sizeof(struct ltt_session, shm_path)];
+} destroy_completion_handler = {
+       .handler = {
+               .run = wait_on_path,
+               .data = destroy_completion_handler.shm_path
+       },
+       .shm_path = { 0 },
+};
+
+static struct cmd_completion_handler *current_completion_handler;
+
  /*
   * Used to keep a unique index for each relayd socket created where this value
   * is associated with streams on the consumer so it can match the right relayd
@@ -3030,6 +3055,59 @@ int cmd_destroy_session(struct ltt_session *session, int wpipe,
                 PERROR("write kernel poll pipe");
         }
  
+       if (session->shm_path[0]) {
+               /*
+                * When a session is created with an explicit shm_path,
+                * the consumer daemon will create its shared memory files
+                * at that location and will *not* unlink them. This is normal
+                * as the intention of that feature is to make it possible
+                * to retrieve the content of those files should a crash occur.
+                *
+                * To ensure the content of those files can be used, the
+                * sessiond daemon will replicate the content of the metadata
+                * cache in a metadata file.
+                *
+                * On clean-up, it is expected that the consumer daemon will
+                * unlink the shared memory files and that the session daemon
+                * will unlink the metadata file. Then, the session's directory
+                * in the shm path can be removed.
+                *
+                * Unfortunately, a flaw in the design of the sessiond's and
+                * consumerd's tear down of channels makes it impossible to
+                * determine when the sessiond _and_ the consumerd have both
+                * destroyed their representation of a channel. For one, the
+                * unlinking, close, and rmdir happen in deferred 'call_rcu'
+                * callbacks in both daemons.
+                *
+                * However, it is also impossible for the sessiond to know when
+                * the consumer daemon is done destroying its channel(s) since
+                * it occurs as a reaction to the closing of the channel's file
+                * descriptor. There is no resulting communication initiated
+                * from the consumerd to the sessiond to confirm that the
+                * operation is completed (and was successful).
+                *
+                * Until this is all fixed, the session daemon checks for the
+                * removal of the session's shm path which makes it possible
+                * to safely advertise a session as having been destroyed.
+                *
+                * Prior to this fix, it was not possible to reliably save
+                * a session making use of the --shm-path option, destroy it,
+                * and load it again. This is because the creation of the
+                * session would fail upon seeing the session's shm path
+                * already in existence.
+                *
+                * Note that none of the error paths in the check for the
+                * directory's existence return an error. This is normal
+                * as there isn't much that can be done. The session will
+                * be destroyed properly, except that we can't offer the
+                * guarantee that the same session can be re-created.
+                */
+               current_completion_handler = &destroy_completion_handler.handler;
+               ret = lttng_strncpy(destroy_completion_handler.shm_path,
+                               session->shm_path,
+                               sizeof(destroy_completion_handler.shm_path));
+               assert(!ret);
+       }
         ret = session_destroy(session);
  
         return ret;
@@ -4864,6 +4942,49 @@ end:
         return ret;
  }
  
+/* Wait for a given path to be removed before continuing. */
+static enum lttng_error_code wait_on_path(void *path_data)
+{
+       const char *shm_path = path_data;
+
+       DBG("Waiting for the shm path at %s to be removed before completing session destruction",
+                       shm_path);
+       while (true) {
+               int ret;
+               struct stat st;
+
+               ret = stat(shm_path, &st);
+               if (ret) {
+                       if (errno != ENOENT) {
+                               PERROR("stat() returned an error while checking for the existence of the shm path");
+                       } else {
+                               DBG("shm path no longer exists, completing the destruction of session");
+                       }
+                       break;
+               } else {
+                       if (!S_ISDIR(st.st_mode)) {
+                               ERR("The type of shm path %s returned by stat() is not a directory; aborting the wait for shm path removal",
+                                               shm_path);
+                               break;
+                       }
+               }
+               usleep(SESSION_DESTROY_SHM_PATH_CHECK_DELAY_US);
+       }
+       return LTTNG_OK;
+}
+
+/*
+ * Returns a pointer to a handler to run on completion of a command.
+ * Returns NULL if no handler has to be run for the last command executed.
+ */
+const struct cmd_completion_handler *cmd_pop_completion_handler(void)
+{
+       struct cmd_completion_handler *handler = current_completion_handler;
+
+       current_completion_handler = NULL;
+       return handler;
+}
+
  /*
   * Init command subsystem.
   */
diff --git a/src/bin/lttng-sessiond/cmd.h b/src/bin/lttng-sessiond/cmd.h

index 6a6c87eaa0b6ea303f11afb63e27c8243bc23857..79ff689c33c970ab25298aed7e9699bea58def83 100644 (file)
--- a/src/bin/lttng-sessiond/cmd.h
+++ b/src/bin/lttng-sessiond/cmd.h
@@ -23,6 +23,20 @@
  
  struct notification_thread_handle;
  
+/*
+ * A callback (and associated user data) that should be run after a command
+ * has been executed. No locks should be taken while executing this handler.
+ *
+ * The command's reply should not be sent until the handler has run and
+ * completed successfully. On failure, the handler's return code should
+ * be the only reply sent to the client.
+ */
+typedef enum lttng_error_code (*completion_handler_function)(void *);
+struct cmd_completion_handler {
+       completion_handler_function run;
+       void *data;
+};
+
  /*
   * Init the command subsystem. Must be called before using any of the functions
   * above. This is called in the main() of the session daemon.
@@ -130,4 +144,6 @@ int cmd_rotation_set_schedule(struct ltt_session *session, uint64_t timer_us,
                 uint64_t size,
                 struct notification_thread_handle *notification_thread_handle);
  
+const struct cmd_completion_handler *cmd_pop_completion_handler(void);
+
  #endif /* CMD_H */
diff --git a/src/bin/lttng-sessiond/main.c b/src/bin/lttng-sessiond/main.c

index 2b2a59adeb589e12a5449ef546df11a129721248..5e644a72d9f821ec638d31e752542fa42cf2cffb 100644 (file)
--- a/src/bin/lttng-sessiond/main.c
+++ b/src/bin/lttng-sessiond/main.c
@@ -4598,6 +4598,8 @@ static void *thread_manage_clients(void *data)
         health_code_update();
  
         while (1) {
+               const struct cmd_completion_handler *cmd_completion_handler;
+
                 DBG("Accepting client command ...");
  
                 /* Inifinite blocking call, waiting for transmission */
@@ -4740,6 +4742,18 @@ static void *thread_manage_clients(void *data)
                         continue;
                 }
  
+               cmd_completion_handler = cmd_pop_completion_handler();
+               if (cmd_completion_handler) {
+                       enum lttng_error_code completion_code;
+
+                       completion_code = cmd_completion_handler->run(
+                                       cmd_completion_handler->data);
+                       if (completion_code != LTTNG_OK) {
+                               clean_command_ctx(&cmd_ctx);
+                               continue;
+                       }
+               }
+
                 health_code_update();
  
                 DBG("Sending response (size: %d, retcode: %s (%d))",
author	Jérémie Galarneau <jeremie.galarneau@efficios.com>
	Wed, 13 Jun 2018 00:13:48 +0000 (20:13 -0400)
committer	Jérémie Galarneau <jeremie.galarneau@efficios.com>
	Wed, 13 Jun 2018 21:27:01 +0000 (17:27 -0400)
src/bin/lttng-sessiond/cmd.c		patch \| blob \| blame \| history
src/bin/lttng-sessiond/cmd.h		patch \| blob \| blame \| history
src/bin/lttng-sessiond/main.c		patch \| blob \| blame \| history