Implement health check for app registration dispatch
[lttng-tools.git] / src / bin / lttng-sessiond / main.c
index 9527371380e02c45011ad60a0f806872e194a861..007c722921700e351cfedd6441fc31a16a9683b3 100644 (file)
@@ -161,6 +161,7 @@ static pthread_t client_thread;
 static pthread_t kernel_thread;
 static pthread_t dispatch_thread;
 static pthread_t health_thread;
+static pthread_t ht_cleanup_thread;
 
 /*
  * UST registration command queue. This queue is tied with a futex and uses a N
@@ -705,9 +706,9 @@ static void *thread_manage_kernel(void *data)
 
        /*
         * This first step of the while is to clean this structure which could free
-        * non NULL pointers so zero it before the loop.
+        * non NULL pointers so initialize it before the loop.
         */
-       memset(&events, 0, sizeof(events));
+       lttng_poll_init(&events);
 
        if (testpoint(thread_manage_kernel)) {
                goto error_testpoint;
@@ -1339,7 +1340,7 @@ error:
  */
 static void *thread_dispatch_ust_registration(void *data)
 {
-       int ret;
+       int ret, err = -1;
        struct cds_wfq_node *node;
        struct ust_command *ust_cmd = NULL;
        struct {
@@ -1347,11 +1348,17 @@ static void *thread_dispatch_ust_registration(void *data)
                struct cds_list_head head;
        } *wait_node = NULL, *tmp_wait_node;
 
+       health_register(HEALTH_TYPE_APP_REG_DISPATCH);
+
+       health_code_update();
+
        CDS_LIST_HEAD(wait_queue);
 
        DBG("[thread] Dispatch UST command started");
 
        while (!CMM_LOAD_SHARED(dispatch_thread_exit)) {
+               health_code_update();
+
                /* Atomically prepare the queue futex */
                futex_nto1_prepare(&ust_cmd_queue.futex);
 
@@ -1359,6 +1366,7 @@ static void *thread_dispatch_ust_registration(void *data)
                        struct ust_app *app = NULL;
                        ust_cmd = NULL;
 
+                       health_code_update();
                        /* Dequeue command for registration */
                        node = cds_wfq_dequeue_blocking(&ust_cmd_queue.queue);
                        if (node == NULL) {
@@ -1380,6 +1388,11 @@ static void *thread_dispatch_ust_registration(void *data)
                                wait_node = zmalloc(sizeof(*wait_node));
                                if (!wait_node) {
                                        PERROR("zmalloc wait_node dispatch");
+                                       ret = close(ust_cmd->sock);
+                                       if (ret < 0) {
+                                               PERROR("close ust sock dispatch %d", ust_cmd->sock);
+                                       }
+                                       lttng_fd_put(1, LTTNG_FD_APPS);
                                        free(ust_cmd);
                                        goto error;
                                }
@@ -1418,6 +1431,7 @@ static void *thread_dispatch_ust_registration(void *data)
                                 */
                                cds_list_for_each_entry_safe(wait_node, tmp_wait_node,
                                                &wait_queue, head) {
+                                       health_code_update();
                                        if (wait_node->app->pid == ust_cmd->reg_msg.pid) {
                                                wait_node->app->notify_sock = ust_cmd->sock;
                                                cds_list_del(&wait_node->head);
@@ -1427,6 +1441,19 @@ static void *thread_dispatch_ust_registration(void *data)
                                                break;
                                        }
                                }
+
+                               /*
+                                * With no application at this stage the received socket is
+                                * basically useless so close it before we free the cmd data
+                                * structure for good.
+                                */
+                               if (!app) {
+                                       ret = close(ust_cmd->sock);
+                                       if (ret < 0) {
+                                               PERROR("close ust sock dispatch %d", ust_cmd->sock);
+                                       }
+                                       lttng_fd_put(1, LTTNG_FD_APPS);
+                               }
                                free(ust_cmd);
                        }
 
@@ -1488,19 +1515,16 @@ static void *thread_dispatch_ust_registration(void *data)
 
                                rcu_read_unlock();
                                session_unlock_list();
-                       } else {
-                               /* Application manager threads are not available. */
-                               ret = close(ust_cmd->sock);
-                               if (ret < 0) {
-                                       PERROR("close ust_cmd sock");
-                               }
-                               lttng_fd_put(1, LTTNG_FD_APPS);
                        }
                } while (node != NULL);
 
+               health_poll_entry();
                /* Futex wait on queue. Blocking call on futex() */
                futex_nto1_wait(&ust_cmd_queue.futex);
+               health_poll_exit();
        }
+       /* Normal exit, no error */
+       err = 0;
 
 error:
        /* Clean up wait queue. */
@@ -1511,6 +1535,11 @@ error:
        }
 
        DBG("Dispatch thread dying");
+       if (err) {
+               health_error();
+               ERR("Health error occurred in %s", __func__);
+       }
+       health_unregister();
        return NULL;
 }
 
@@ -2895,6 +2924,7 @@ skip_domain:
 
                ret = setup_lttng_msg(cmd_ctx, nb_dom * sizeof(struct lttng_domain));
                if (ret < 0) {
+                       free(domains);
                        goto setup_error;
                }
 
@@ -2922,6 +2952,7 @@ skip_domain:
 
                ret = setup_lttng_msg(cmd_ctx, nb_chan * sizeof(struct lttng_channel));
                if (ret < 0) {
+                       free(channels);
                        goto setup_error;
                }
 
@@ -2949,6 +2980,7 @@ skip_domain:
 
                ret = setup_lttng_msg(cmd_ctx, nb_event * sizeof(struct lttng_event));
                if (ret < 0) {
+                       free(events);
                        goto setup_error;
                }
 
@@ -3094,6 +3126,9 @@ static void *thread_manage_health(void *data)
 
        rcu_register_thread();
 
+       /* We might hit an error path before this is created. */
+       lttng_poll_init(&events);
+
        /* Create unix socket */
        sock = lttcomm_create_unix_sock(health_unix_sock_path);
        if (sock < 0) {
@@ -3208,13 +3243,25 @@ restart:
                case LTTNG_HEALTH_CONSUMER:
                        reply.ret_code = check_consumer_health();
                        break;
+               case LTTNG_HEALTH_HT_CLEANUP:
+                       reply.ret_code = health_check_state(HEALTH_TYPE_HT_CLEANUP);
+                       break;
+               case LTTNG_HEALTH_APP_MANAGE_NOTIFY:
+                       reply.ret_code = health_check_state(HEALTH_TYPE_APP_MANAGE_NOTIFY);
+                       break;
+               case LTTNG_HEALTH_APP_REG_DISPATCH:
+                       reply.ret_code = health_check_state(HEALTH_TYPE_APP_REG_DISPATCH);
+                       break;
                case LTTNG_HEALTH_ALL:
                        reply.ret_code =
                                health_check_state(HEALTH_TYPE_APP_MANAGE) &&
                                health_check_state(HEALTH_TYPE_APP_REG) &&
                                health_check_state(HEALTH_TYPE_CMD) &&
                                health_check_state(HEALTH_TYPE_KERNEL) &&
-                               check_consumer_health();
+                               check_consumer_health() &&
+                               health_check_state(HEALTH_TYPE_HT_CLEANUP) &&
+                               health_check_state(HEALTH_TYPE_APP_MANAGE_NOTIFY) &&
+                               health_check_state(HEALTH_TYPE_APP_REG_DISPATCH);
                        break;
                default:
                        reply.ret_code = LTTNG_ERR_UND;
@@ -3259,12 +3306,6 @@ error:
                        PERROR("close");
                }
        }
-       if (new_sock >= 0) {
-               ret = close(new_sock);
-               if (ret) {
-                       PERROR("close");
-               }
-       }
 
        lttng_poll_clean(&events);
 
@@ -3446,13 +3487,11 @@ static void *thread_manage_clients(void *data)
                ret = process_client_msg(cmd_ctx, sock, &sock_error);
                rcu_thread_offline();
                if (ret < 0) {
-                       if (sock_error) {
-                               ret = close(sock);
-                               if (ret) {
-                                       PERROR("close");
-                               }
-                               sock = -1;
+                       ret = close(sock);
+                       if (ret) {
+                               PERROR("close");
                        }
+                       sock = -1;
                        /*
                         * TODO: Inform client somehow of the fatal error. At
                         * this point, ret < 0 means that a zmalloc failed
@@ -4284,6 +4323,11 @@ int main(int argc, char **argv)
                }
        }
 
+       /* Setup the thread ht_cleanup communication pipe. */
+       if (utils_create_pipe_cloexec(ht_cleanup_pipe) < 0) {
+               goto exit;
+       }
+
        /* Setup the thread apps communication pipe. */
        if ((ret = utils_create_pipe_cloexec(apps_cmd_pipe)) < 0) {
                goto exit;
@@ -4322,6 +4366,14 @@ int main(int argc, char **argv)
 
        write_pidfile();
 
+       /* Create the ht_cleanup thread */
+       ret = pthread_create(&ht_cleanup_thread, NULL,
+                       thread_ht_cleanup, (void *) NULL);
+       if (ret != 0) {
+               PERROR("pthread_create ht_cleanup");
+               goto exit_ht_cleanup;
+       }
+
        /* Create thread to manage the client socket */
        ret = pthread_create(&health_thread, NULL,
                        thread_manage_health, (void *) NULL);
@@ -4441,6 +4493,12 @@ exit_client:
        }
 
 exit_health:
+       ret = pthread_join(ht_cleanup_thread, &status);
+       if (ret != 0) {
+               PERROR("pthread_join ht cleanup thread");
+               goto error;     /* join error, exit without cleanup */
+       }
+exit_ht_cleanup:
 exit:
        /*
         * cleanup() is called when no other thread is running.
This page took 0.027285 seconds and 5 git commands to generate.