Implement health check for app registration dispatch
[lttng-tools.git] / src / bin / lttng-sessiond / main.c
index 4470afc5e8dd1cf51428f9b5ce22d5ce2e5c217f..007c722921700e351cfedd6441fc31a16a9683b3 100644 (file)
@@ -161,6 +161,7 @@ static pthread_t client_thread;
 static pthread_t kernel_thread;
 static pthread_t dispatch_thread;
 static pthread_t health_thread;
+static pthread_t ht_cleanup_thread;
 
 /*
  * UST registration command queue. This queue is tied with a futex and uses a N
@@ -1339,7 +1340,7 @@ error:
  */
 static void *thread_dispatch_ust_registration(void *data)
 {
-       int ret;
+       int ret, err = -1;
        struct cds_wfq_node *node;
        struct ust_command *ust_cmd = NULL;
        struct {
@@ -1347,11 +1348,17 @@ static void *thread_dispatch_ust_registration(void *data)
                struct cds_list_head head;
        } *wait_node = NULL, *tmp_wait_node;
 
+       health_register(HEALTH_TYPE_APP_REG_DISPATCH);
+
+       health_code_update();
+
        CDS_LIST_HEAD(wait_queue);
 
        DBG("[thread] Dispatch UST command started");
 
        while (!CMM_LOAD_SHARED(dispatch_thread_exit)) {
+               health_code_update();
+
                /* Atomically prepare the queue futex */
                futex_nto1_prepare(&ust_cmd_queue.futex);
 
@@ -1359,6 +1366,7 @@ static void *thread_dispatch_ust_registration(void *data)
                        struct ust_app *app = NULL;
                        ust_cmd = NULL;
 
+                       health_code_update();
                        /* Dequeue command for registration */
                        node = cds_wfq_dequeue_blocking(&ust_cmd_queue.queue);
                        if (node == NULL) {
@@ -1423,6 +1431,7 @@ static void *thread_dispatch_ust_registration(void *data)
                                 */
                                cds_list_for_each_entry_safe(wait_node, tmp_wait_node,
                                                &wait_queue, head) {
+                                       health_code_update();
                                        if (wait_node->app->pid == ust_cmd->reg_msg.pid) {
                                                wait_node->app->notify_sock = ust_cmd->sock;
                                                cds_list_del(&wait_node->head);
@@ -1509,9 +1518,13 @@ static void *thread_dispatch_ust_registration(void *data)
                        }
                } while (node != NULL);
 
+               health_poll_entry();
                /* Futex wait on queue. Blocking call on futex() */
                futex_nto1_wait(&ust_cmd_queue.futex);
+               health_poll_exit();
        }
+       /* Normal exit, no error */
+       err = 0;
 
 error:
        /* Clean up wait queue. */
@@ -1522,6 +1535,11 @@ error:
        }
 
        DBG("Dispatch thread dying");
+       if (err) {
+               health_error();
+               ERR("Health error occurred in %s", __func__);
+       }
+       health_unregister();
        return NULL;
 }
 
@@ -3225,13 +3243,25 @@ restart:
                case LTTNG_HEALTH_CONSUMER:
                        reply.ret_code = check_consumer_health();
                        break;
+               case LTTNG_HEALTH_HT_CLEANUP:
+                       reply.ret_code = health_check_state(HEALTH_TYPE_HT_CLEANUP);
+                       break;
+               case LTTNG_HEALTH_APP_MANAGE_NOTIFY:
+                       reply.ret_code = health_check_state(HEALTH_TYPE_APP_MANAGE_NOTIFY);
+                       break;
+               case LTTNG_HEALTH_APP_REG_DISPATCH:
+                       reply.ret_code = health_check_state(HEALTH_TYPE_APP_REG_DISPATCH);
+                       break;
                case LTTNG_HEALTH_ALL:
                        reply.ret_code =
                                health_check_state(HEALTH_TYPE_APP_MANAGE) &&
                                health_check_state(HEALTH_TYPE_APP_REG) &&
                                health_check_state(HEALTH_TYPE_CMD) &&
                                health_check_state(HEALTH_TYPE_KERNEL) &&
-                               check_consumer_health();
+                               check_consumer_health() &&
+                               health_check_state(HEALTH_TYPE_HT_CLEANUP) &&
+                               health_check_state(HEALTH_TYPE_APP_MANAGE_NOTIFY) &&
+                               health_check_state(HEALTH_TYPE_APP_REG_DISPATCH);
                        break;
                default:
                        reply.ret_code = LTTNG_ERR_UND;
@@ -4293,6 +4323,11 @@ int main(int argc, char **argv)
                }
        }
 
+       /* Setup the thread ht_cleanup communication pipe. */
+       if (utils_create_pipe_cloexec(ht_cleanup_pipe) < 0) {
+               goto exit;
+       }
+
        /* Setup the thread apps communication pipe. */
        if ((ret = utils_create_pipe_cloexec(apps_cmd_pipe)) < 0) {
                goto exit;
@@ -4331,6 +4366,14 @@ int main(int argc, char **argv)
 
        write_pidfile();
 
+       /* Create the ht_cleanup thread */
+       ret = pthread_create(&ht_cleanup_thread, NULL,
+                       thread_ht_cleanup, (void *) NULL);
+       if (ret != 0) {
+               PERROR("pthread_create ht_cleanup");
+               goto exit_ht_cleanup;
+       }
+
        /* Create thread to manage the client socket */
        ret = pthread_create(&health_thread, NULL,
                        thread_manage_health, (void *) NULL);
@@ -4450,6 +4493,12 @@ exit_client:
        }
 
 exit_health:
+       ret = pthread_join(ht_cleanup_thread, &status);
+       if (ret != 0) {
+               PERROR("pthread_join ht cleanup thread");
+               goto error;     /* join error, exit without cleanup */
+       }
+exit_ht_cleanup:
 exit:
        /*
         * cleanup() is called when no other thread is running.
This page took 0.027163 seconds and 5 git commands to generate.