Move health into its own common/ static library
[lttng-tools.git] / src / bin / lttng-sessiond / main.c
index 6c6386d2b3b1107255260110175743fa04ccf9bf..550aa1b83f68d65f01e936d1e81fb5308b0b2b4e 100644 (file)
@@ -60,7 +60,7 @@
 #include "ust-consumer.h"
 #include "utils.h"
 #include "fd-limit.h"
-#include "health.h"
+#include "health-sessiond.h"
 #include "testpoint.h"
 #include "ust-thread.h"
 
@@ -233,6 +233,9 @@ static int app_socket_timeout;
 /* Set in main() with the current page size. */
 long page_size;
 
+/* Application health monitoring */
+struct health_app *health_sessiond;
+
 static
 void setup_consumerd_path(void)
 {
@@ -690,9 +693,6 @@ static int update_kernel_stream(struct consumer_data *consumer_data, int fd)
                                        rcu_read_lock();
                                        cds_lfht_for_each_entry(ksess->consumer->socks->ht,
                                                        &iter.iter, socket, node.node) {
-                                               /* Code flow error */
-                                               assert(socket->fd);
-
                                                pthread_mutex_lock(socket->lock);
                                                ret = kernel_consumer_send_channel_stream(socket,
                                                                channel, ksess,
@@ -758,7 +758,7 @@ static void *thread_manage_kernel(void *data)
 
        DBG("[thread] Thread manage kernel started");
 
-       health_register(HEALTH_TYPE_KERNEL);
+       health_register(health_sessiond, HEALTH_TYPE_KERNEL);
 
        /*
         * This first step of the while is to clean this structure which could free
@@ -883,7 +883,7 @@ error_testpoint:
                WARN("Kernel thread died unexpectedly. "
                                "Kernel tracing can continue but CPU hotplug is disabled.");
        }
-       health_unregister();
+       health_unregister(health_sessiond);
        DBG("Kernel thread dying");
        return NULL;
 }
@@ -924,7 +924,7 @@ static void *thread_manage_consumer(void *data)
 
        DBG("[thread] Manage consumer started");
 
-       health_register(HEALTH_TYPE_CONSUMER);
+       health_register(health_sessiond, HEALTH_TYPE_CONSUMER);
 
        health_code_update();
 
@@ -1024,13 +1024,14 @@ restart:
                        lttcomm_connect_unix_sock(consumer_data->cmd_unix_sock_path);
                consumer_data->metadata_fd =
                        lttcomm_connect_unix_sock(consumer_data->cmd_unix_sock_path);
-               if (consumer_data->cmd_sock < 0 || consumer_data->metadata_fd < 0) {
+               if (consumer_data->cmd_sock < 0
+                               || consumer_data->metadata_fd < 0) {
                        PERROR("consumer connect cmd socket");
                        /* On error, signal condition and quit. */
                        signal_consumer_condition(consumer_data, -1);
                        goto error;
                }
-               consumer_data->metadata_sock.fd = &consumer_data->metadata_fd;
+               consumer_data->metadata_sock.fd_ptr = &consumer_data->metadata_fd;
                /* Create metadata socket lock. */
                consumer_data->metadata_sock.lock = zmalloc(sizeof(pthread_mutex_t));
                if (consumer_data->metadata_sock.lock == NULL) {
@@ -1143,8 +1144,8 @@ exit:
 error:
        /*
         * We lock here because we are about to close the sockets and some other
-        * thread might be using them so wait before we are exclusive which will
-        * abort all other consumer command by other threads.
+        * thread might be using them, so get exclusive access, which will abort
+        * all other consumer commands issued by other threads.
         */
        pthread_mutex_lock(&consumer_data->lock);
 
@@ -1173,8 +1174,8 @@ error:
                }
                consumer_data->cmd_sock = -1;
        }
-       if (*consumer_data->metadata_sock.fd >= 0) {
-               ret = close(*consumer_data->metadata_sock.fd);
+       if (*consumer_data->metadata_sock.fd_ptr >= 0) {
+               ret = close(*consumer_data->metadata_sock.fd_ptr);
                if (ret) {
                        PERROR("close");
                }
@@ -1191,6 +1192,7 @@ error:
        unlink(consumer_data->cmd_unix_sock_path);
        consumer_data->pid = 0;
        pthread_mutex_unlock(&consumer_data->lock);
+
        /* Cleanup metadata socket mutex. */
        pthread_mutex_destroy(consumer_data->metadata_sock.lock);
        free(consumer_data->metadata_sock.lock);
@@ -1201,7 +1203,7 @@ error_poll:
                health_error();
                ERR("Health error occurred in %s", __func__);
        }
-       health_unregister();
+       health_unregister(health_sessiond);
        DBG("consumer thread cleanup completed");
 
        return NULL;
@@ -1221,7 +1223,7 @@ static void *thread_manage_apps(void *data)
        rcu_register_thread();
        rcu_thread_online();
 
-       health_register(HEALTH_TYPE_APP_MANAGE);
+       health_register(health_sessiond, HEALTH_TYPE_APP_MANAGE);
 
        if (testpoint(thread_manage_apps)) {
                goto error_testpoint;
@@ -1367,7 +1369,7 @@ error_testpoint:
                health_error();
                ERR("Health error occurred in %s", __func__);
        }
-       health_unregister();
+       health_unregister(health_sessiond);
        DBG("Application communication apps thread cleanup complete");
        rcu_thread_offline();
        rcu_unregister_thread();
@@ -1378,6 +1380,9 @@ error_testpoint:
  * Send a socket to a thread This is called from the dispatch UST registration
  * thread once all sockets are set for the application.
  *
+ * The sock value can be invalid; we don't really care since the thread will
+ * handle it and perform the necessary cleanup if so.
+ *
  * On success, return 0 else a negative value being the errno message of the
  * write().
  */
@@ -1385,9 +1390,14 @@ static int send_socket_to_thread(int fd, int sock)
 {
        int ret;
 
-       /* Sockets MUST be set or else this should not have been called. */
-       assert(fd >= 0);
-       assert(sock >= 0);
+       /*
+        * It's possible that the FD was concurrently set to -1 (invalid) just
+        * before this function is called because the thread is shutting down.
+        */
+       if (fd < 0) {
+               ret = -EBADF;
+               goto error;
+       }
 
        do {
                ret = write(fd, &sock, sizeof(sock));
@@ -1505,7 +1515,7 @@ static void *thread_dispatch_ust_registration(void *data)
                .count = 0,
        };
 
-       health_register(HEALTH_TYPE_APP_REG_DISPATCH);
+       health_register(health_sessiond, HEALTH_TYPE_APP_REG_DISPATCH);
 
        health_code_update();
 
@@ -1650,7 +1660,12 @@ static void *thread_dispatch_ust_registration(void *data)
                                if (ret < 0) {
                                        rcu_read_unlock();
                                        session_unlock_list();
-                                       /* No notify thread, stop the UST tracing. */
+                                       /*
+                                        * No notify thread, stop the UST tracing. However, this is
+                                        * not an internal error of this thread, thus set the health
+                                        * error code to a normal exit.
+                                        */
+                                       err = 0;
                                        goto error;
                                }
 
@@ -1675,7 +1690,12 @@ static void *thread_dispatch_ust_registration(void *data)
                                if (ret < 0) {
                                        rcu_read_unlock();
                                        session_unlock_list();
-                                       /* No apps. thread, stop the UST tracing. */
+                                       /*
+                                        * No apps. thread, stop the UST tracing. However, this is
+                                        * not an internal error of this thread, thus set the health
+                                        * error code to a normal exit.
+                                        */
+                                       err = 0;
                                        goto error;
                                }
 
@@ -1706,7 +1726,7 @@ error:
                health_error();
                ERR("Health error occurred in %s", __func__);
        }
-       health_unregister();
+       health_unregister(health_sessiond);
        return NULL;
 }
 
@@ -1726,7 +1746,7 @@ static void *thread_registration_apps(void *data)
 
        DBG("[thread] Manage application registration started");
 
-       health_register(HEALTH_TYPE_APP_REG);
+       health_register(health_sessiond, HEALTH_TYPE_APP_REG);
 
        if (testpoint(thread_registration_apps)) {
                goto error_testpoint;
@@ -1906,7 +1926,7 @@ error_listen:
 error_create_poll:
 error_testpoint:
        DBG("UST Registration thread cleanup complete");
-       health_unregister();
+       health_unregister(health_sessiond);
 
        return NULL;
 }
@@ -2283,7 +2303,7 @@ static int check_consumer_health(void)
 {
        int ret;
 
-       ret = health_check_state(HEALTH_TYPE_CONSUMER);
+       ret = health_check_state(health_sessiond, HEALTH_TYPE_CONSUMER);
 
        DBG3("Health consumer check %d", ret);
 
@@ -2451,6 +2471,7 @@ static int create_ust_session(struct ltt_session *session,
        lus->gid = session->gid;
        lus->output_traces = session->output_traces;
        lus->snapshot_mode = session->snapshot_mode;
+       lus->live_timer_interval = session->live_timer;
        session->ust_session = lus;
 
        /* Copy session output to the newly created UST session */
@@ -2565,6 +2586,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock,
        switch (cmd_ctx->lsm->cmd_type) {
        case LTTNG_CREATE_SESSION:
        case LTTNG_CREATE_SESSION_SNAPSHOT:
+       case LTTNG_CREATE_SESSION_LIVE:
        case LTTNG_DESTROY_SESSION:
        case LTTNG_LIST_SESSIONS:
        case LTTNG_LIST_DOMAINS:
@@ -2628,6 +2650,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock,
        switch (cmd_ctx->lsm->cmd_type) {
        case LTTNG_CREATE_SESSION:
        case LTTNG_CREATE_SESSION_SNAPSHOT:
+       case LTTNG_CREATE_SESSION_LIVE:
        case LTTNG_CALIBRATE:
        case LTTNG_LIST_SESSIONS:
        case LTTNG_LIST_TRACEPOINTS:
@@ -3072,7 +3095,7 @@ skip_domain:
                }
 
                ret = cmd_create_session_uri(cmd_ctx->lsm->session.name, uris, nb_uri,
-                       &cmd_ctx->creds);
+                       &cmd_ctx->creds, 0);
 
                free(uris);
 
@@ -3362,6 +3385,45 @@ skip_domain:
                free(uris);
                break;
        }
+       case LTTNG_CREATE_SESSION_LIVE:
+       {
+               size_t nb_uri, len;
+               struct lttng_uri *uris = NULL;
+
+               nb_uri = cmd_ctx->lsm->u.uri.size;
+               len = nb_uri * sizeof(struct lttng_uri);
+
+               if (nb_uri > 0) {
+                       uris = zmalloc(len);
+                       if (uris == NULL) {
+                               ret = LTTNG_ERR_FATAL;
+                               goto error;
+                       }
+
+                       /* Receive variable len data */
+                       DBG("Waiting for %zu URIs from client ...", nb_uri);
+                       ret = lttcomm_recv_unix_sock(sock, uris, len);
+                       if (ret <= 0) {
+                               DBG("No URIs received from client... continuing");
+                               *sock_error = 1;
+                               ret = LTTNG_ERR_SESSION_FAIL;
+                               free(uris);
+                               goto error;
+                       }
+
+                       if (nb_uri == 1 && uris[0].dtype != LTTNG_DST_PATH) {
+                               DBG("Creating session with ONE network URI is a bad call");
+                               ret = LTTNG_ERR_SESSION_FAIL;
+                               free(uris);
+                               goto error;
+                       }
+               }
+
+               ret = cmd_create_session_uri(cmd_ctx->lsm->session.name, uris,
+                               nb_uri, &cmd_ctx->creds, cmd_ctx->lsm->u.session_live.timer_interval);
+               free(uris);
+               break;
+       }
        default:
                ret = LTTNG_ERR_UND;
                break;
@@ -3505,39 +3567,39 @@ restart:
 
                switch (msg.component) {
                case LTTNG_HEALTH_CMD:
-                       reply.ret_code = health_check_state(HEALTH_TYPE_CMD);
+                       reply.ret_code = health_check_state(health_sessiond, HEALTH_TYPE_CMD);
                        break;
                case LTTNG_HEALTH_APP_MANAGE:
-                       reply.ret_code = health_check_state(HEALTH_TYPE_APP_MANAGE);
+                       reply.ret_code = health_check_state(health_sessiond, HEALTH_TYPE_APP_MANAGE);
                        break;
                case LTTNG_HEALTH_APP_REG:
-                       reply.ret_code = health_check_state(HEALTH_TYPE_APP_REG);
+                       reply.ret_code = health_check_state(health_sessiond, HEALTH_TYPE_APP_REG);
                        break;
                case LTTNG_HEALTH_KERNEL:
-                       reply.ret_code = health_check_state(HEALTH_TYPE_KERNEL);
+                       reply.ret_code = health_check_state(health_sessiond, HEALTH_TYPE_KERNEL);
                        break;
                case LTTNG_HEALTH_CONSUMER:
                        reply.ret_code = check_consumer_health();
                        break;
                case LTTNG_HEALTH_HT_CLEANUP:
-                       reply.ret_code = health_check_state(HEALTH_TYPE_HT_CLEANUP);
+                       reply.ret_code = health_check_state(health_sessiond, HEALTH_TYPE_HT_CLEANUP);
                        break;
                case LTTNG_HEALTH_APP_MANAGE_NOTIFY:
-                       reply.ret_code = health_check_state(HEALTH_TYPE_APP_MANAGE_NOTIFY);
+                       reply.ret_code = health_check_state(health_sessiond, HEALTH_TYPE_APP_MANAGE_NOTIFY);
                        break;
                case LTTNG_HEALTH_APP_REG_DISPATCH:
-                       reply.ret_code = health_check_state(HEALTH_TYPE_APP_REG_DISPATCH);
+                       reply.ret_code = health_check_state(health_sessiond, HEALTH_TYPE_APP_REG_DISPATCH);
                        break;
                case LTTNG_HEALTH_ALL:
                        reply.ret_code =
-                               health_check_state(HEALTH_TYPE_APP_MANAGE) &&
-                               health_check_state(HEALTH_TYPE_APP_REG) &&
-                               health_check_state(HEALTH_TYPE_CMD) &&
-                               health_check_state(HEALTH_TYPE_KERNEL) &&
+                               health_check_state(health_sessiond, HEALTH_TYPE_APP_MANAGE) &&
+                               health_check_state(health_sessiond, HEALTH_TYPE_APP_REG) &&
+                               health_check_state(health_sessiond, HEALTH_TYPE_CMD) &&
+                               health_check_state(health_sessiond, HEALTH_TYPE_KERNEL) &&
                                check_consumer_health() &&
-                               health_check_state(HEALTH_TYPE_HT_CLEANUP) &&
-                               health_check_state(HEALTH_TYPE_APP_MANAGE_NOTIFY) &&
-                               health_check_state(HEALTH_TYPE_APP_REG_DISPATCH);
+                               health_check_state(health_sessiond, HEALTH_TYPE_HT_CLEANUP) &&
+                               health_check_state(health_sessiond, HEALTH_TYPE_APP_MANAGE_NOTIFY) &&
+                               health_check_state(health_sessiond, HEALTH_TYPE_APP_REG_DISPATCH);
                        break;
                default:
                        reply.ret_code = LTTNG_ERR_UND;
@@ -3605,7 +3667,7 @@ static void *thread_manage_clients(void *data)
 
        rcu_register_thread();
 
-       health_register(HEALTH_TYPE_CMD);
+       health_register(health_sessiond, HEALTH_TYPE_CMD);
 
        if (testpoint(thread_manage_clients)) {
                goto error_testpoint;
@@ -3829,7 +3891,7 @@ error_testpoint:
                ERR("Health error occurred in %s", __func__);
        }
 
-       health_unregister();
+       health_unregister(health_sessiond);
 
        DBG("Client thread dying");
 
@@ -4653,6 +4715,18 @@ int main(int argc, char **argv)
 
        /* Initialize communication library */
        lttcomm_init();
+       /* This is to get the TCP timeout value. */
+       lttcomm_inet_init();
+
+       /*
+        * Initialize the health check subsystem. This call should set the
+        * appropriate time values.
+        */
+       health_sessiond = health_app_create(HEALTH_NUM_TYPE);
+       if (!health_sessiond) {
+               PERROR("health_app_create error");
+               goto exit_health_sessiond_cleanup;
+       }
 
        /* Create thread to manage the client socket */
        ret = pthread_create(&ht_cleanup_thread, NULL,
@@ -4707,7 +4781,7 @@ int main(int argc, char **argv)
                        ust_thread_manage_notify, (void *) NULL);
        if (ret != 0) {
                PERROR("pthread_create apps");
-               goto exit_apps;
+               goto exit_apps_notify;
        }
 
        /* Don't start this thread if kernel tracing is not requested nor root */
@@ -4728,12 +4802,20 @@ int main(int argc, char **argv)
        }
 
 exit_kernel:
+       ret = pthread_join(apps_notify_thread, &status);
+       if (ret != 0) {
+               PERROR("pthread_join apps notify");
+               goto error;     /* join error, exit without cleanup */
+       }
+
+exit_apps_notify:
        ret = pthread_join(apps_thread, &status);
        if (ret != 0) {
-               PERROR("pthread_join");
+               PERROR("pthread_join apps");
                goto error;     /* join error, exit without cleanup */
        }
 
+
 exit_apps:
        ret = pthread_join(reg_apps_thread, &status);
        if (ret != 0) {
@@ -4787,6 +4869,8 @@ exit_health:
                goto error;     /* join error, exit without cleanup */
        }
 exit_ht_cleanup:
+       health_app_destroy(health_sessiond);
+exit_health_sessiond_cleanup:
 exit:
        /*
         * cleanup() is called when no other thread is running.
This page took 0.030939 seconds and 5 git commands to generate.