X-Git-Url: http://git.efficios.com/?a=blobdiff_plain;f=src%2Fbin%2Flttng-sessiond%2Fmain.c;h=b688a4c09e45f25c3d62bc82b86cc73bcb23fe16;hb=4ce514c43483ba24fd935024da5b7aca681a7e52;hp=f1cd1b6407f4ee6f34cd18d976368825c3d5d7e3;hpb=380e8d6f886dfe6633044abfa77739d1210979a3;p=lttng-tools.git

diff --git a/src/bin/lttng-sessiond/main.c b/src/bin/lttng-sessiond/main.c
index f1cd1b640..b688a4c09 100644
--- a/src/bin/lttng-sessiond/main.c
+++ b/src/bin/lttng-sessiond/main.c
@@ -90,7 +90,6 @@ static struct consumer_data kconsumer_data = {
 	.cmd_unix_sock_path = DEFAULT_KCONSUMERD_CMD_SOCK_PATH,
 	.err_sock = -1,
 	.cmd_sock = -1,
-	.metadata_sock.fd = -1,
 	.pid_mutex = PTHREAD_MUTEX_INITIALIZER,
 	.lock = PTHREAD_MUTEX_INITIALIZER,
 	.cond = PTHREAD_COND_INITIALIZER,
@@ -102,7 +101,6 @@ static struct consumer_data ustconsumer64_data = {
 	.cmd_unix_sock_path = DEFAULT_USTCONSUMERD64_CMD_SOCK_PATH,
 	.err_sock = -1,
 	.cmd_sock = -1,
-	.metadata_sock.fd = -1,
 	.pid_mutex = PTHREAD_MUTEX_INITIALIZER,
 	.lock = PTHREAD_MUTEX_INITIALIZER,
 	.cond = PTHREAD_COND_INITIALIZER,
@@ -114,7 +112,6 @@ static struct consumer_data ustconsumer32_data = {
 	.cmd_unix_sock_path = DEFAULT_USTCONSUMERD32_CMD_SOCK_PATH,
 	.err_sock = -1,
 	.cmd_sock = -1,
-	.metadata_sock.fd = -1,
 	.pid_mutex = PTHREAD_MUTEX_INITIALIZER,
 	.lock = PTHREAD_MUTEX_INITIALIZER,
 	.cond = PTHREAD_COND_INITIALIZER,
@@ -390,6 +387,51 @@ static void stop_threads(void)
 	futex_nto1_wake(&ust_cmd_queue.futex);
 }
 
+/*
+ * Close every consumer sockets.
+ */
+static void close_consumer_sockets(void)
+{
+	int ret;
+
+	if (kconsumer_data.err_sock >= 0) {
+		ret = close(kconsumer_data.err_sock);
+		if (ret < 0) {
+			PERROR("kernel consumer err_sock close");
+		}
+	}
+	if (ustconsumer32_data.err_sock >= 0) {
+		ret = close(ustconsumer32_data.err_sock);
+		if (ret < 0) {
+			PERROR("UST consumerd32 err_sock close");
+		}
+	}
+	if (ustconsumer64_data.err_sock >= 0) {
+		ret = close(ustconsumer64_data.err_sock);
+		if (ret < 0) {
+			PERROR("UST consumerd64 err_sock close");
+		}
+	}
+	if (kconsumer_data.cmd_sock >= 0) {
+		ret = close(kconsumer_data.cmd_sock);
+		if (ret < 0) {
+			PERROR("kernel consumer cmd_sock close");
+		}
+	}
+	if (ustconsumer32_data.cmd_sock >= 0) {
+		ret = close(ustconsumer32_data.cmd_sock);
+		if (ret < 0) {
+			PERROR("UST consumerd32 cmd_sock close");
+		}
+	}
+	if (ustconsumer64_data.cmd_sock >= 0) {
+		ret = close(ustconsumer64_data.cmd_sock);
+		if (ret < 0) {
+			PERROR("UST consumerd64 cmd_sock close");
+		}
+	}
+}
+
 /*
  * Cleanup the daemon
  */
@@ -401,7 +443,10 @@ static void cleanup(void)
 
 	DBG("Cleaning up");
 
-	/* First thing first, stop all threads */
+	/*
+	 * Close the thread quit pipe. It has already done its job,
+	 * since we are now called.
+	 */
 	utils_close_pipe(thread_quit_pipe);
 
 	/*
@@ -458,6 +503,8 @@ static void cleanup(void)
 		modprobe_remove_lttng_all();
 	}
 
+	close_consumer_sockets();
+
 	/* <fun> */
 	DBG("%c[%d;%dm*** assert failed :-) *** ==> %c[%dm%c[%d;%dm"
 			"Matthew, BEET driven development works!%c[%dm",
@@ -628,6 +675,8 @@ static int update_kernel_stream(struct consumer_data *consumer_data, int fd)
 			if (ret < 0) {
 				goto error;
 			}
+			/* Update the stream global counter */
+			ksess->stream_count_global += ret;
 
 			/*
 			 * Have we already sent fds to the consumer? If yes, it means
@@ -642,11 +691,12 @@ static int update_kernel_stream(struct consumer_data *consumer_data, int fd)
 		cds_lfht_for_each_entry(ksess->consumer->socks->ht, &iter.iter,
 				socket, node.node) {
 			/* Code flow error */
-			assert(socket->fd >= 0);
+			assert(socket->fd);
 
 			pthread_mutex_lock(socket->lock);
 			ret = kernel_consumer_send_channel_stream(socket,
-					channel, ksess);
+					channel, ksess,
+					session->output_traces ? 1 : 0);
 			pthread_mutex_unlock(socket->lock);
 			if (ret < 0) {
 				rcu_read_unlock();
@@ -966,15 +1016,15 @@ restart:
 		/* Connect both socket, command and metadata. */
 		consumer_data->cmd_sock =
 				lttcomm_connect_unix_sock(consumer_data->cmd_unix_sock_path);
-		consumer_data->metadata_sock.fd =
+		consumer_data->metadata_fd =
 				lttcomm_connect_unix_sock(consumer_data->cmd_unix_sock_path);
-		if (consumer_data->cmd_sock < 0 ||
-				consumer_data->metadata_sock.fd < 0) {
+		if (consumer_data->cmd_sock < 0 || consumer_data->metadata_fd < 0) {
 			PERROR("consumer connect cmd socket");
 			/* On error, signal condition and quit. */
 			signal_consumer_condition(consumer_data, -1);
 			goto error;
 		}
+		consumer_data->metadata_sock.fd = &consumer_data->metadata_fd;
 		/* Create metadata socket lock. */
 		consumer_data->metadata_sock.lock = zmalloc(sizeof(pthread_mutex_t));
 		if (consumer_data->metadata_sock.lock == NULL) {
@@ -987,7 +1037,7 @@ restart:
 		signal_consumer_condition(consumer_data, 1);
 		DBG("Consumer command socket ready (fd: %d", consumer_data->cmd_sock);
 		DBG("Consumer metadata socket ready (fd: %d)",
-				consumer_data->metadata_sock.fd);
+				consumer_data->metadata_fd);
 	} else {
 		ERR("consumer error when waiting for SOCK_READY : %s",
 				lttcomm_get_readable_code(-code));
@@ -1007,7 +1057,7 @@ restart:
 	}
 
 	/* Add metadata socket that is successfully connected. */
-	ret = lttng_poll_add(&events, consumer_data->metadata_sock.fd,
+	ret = lttng_poll_add(&events, consumer_data->metadata_fd,
 			LPOLLIN | LPOLLRDHUP);
 	if (ret < 0) {
 		goto error;
 	}
@@ -1066,7 +1116,7 @@ restart_poll:
 					lttcomm_get_readable_code(-code));
 
 			goto exit;
-		} else if (pollfd == consumer_data->metadata_sock.fd) {
+		} else if (pollfd == consumer_data->metadata_fd) {
 			/* UST metadata requests */
 			ret = ust_consumer_metadata_request(
 					&consumer_data->metadata_sock);
@@ -1101,15 +1151,17 @@ error:
 		if (ret) {
 			PERROR("close");
 		}
+		consumer_data->err_sock = -1;
 	}
 	if (consumer_data->cmd_sock >= 0) {
 		ret = close(consumer_data->cmd_sock);
 		if (ret) {
 			PERROR("close");
 		}
+		consumer_data->cmd_sock = -1;
 	}
-	if (consumer_data->metadata_sock.fd >= 0) {
-		ret = close(consumer_data->metadata_sock.fd);
+	if (*consumer_data->metadata_sock.fd >= 0) {
+		ret = close(*consumer_data->metadata_sock.fd);
 		if (ret) {
 			PERROR("close");
 		}
 	}
@@ -1243,11 +1295,17 @@ static void *thread_manage_apps(void *data)
 					goto error;
 				}
 
-				/* Set socket timeout for both receiving and ending */
+				/*
+				 * Set socket timeout for both receiving and ending.
+				 * app_socket_timeout is in seconds, whereas
+				 * lttcomm_setsockopt_rcv_timeout and
+				 * lttcomm_setsockopt_snd_timeout expect msec as
+				 * parameter.
+				 */
 				(void) lttcomm_setsockopt_rcv_timeout(sock,
-						app_socket_timeout);
+						app_socket_timeout * 1000);
 				(void) lttcomm_setsockopt_snd_timeout(sock,
-						app_socket_timeout);
+						app_socket_timeout * 1000);
 
 				DBG("Apps with sock %d added to poll set", sock);
 
@@ -1334,25 +1392,116 @@ error:
 	return ret;
 }
 
+/*
+ * Sanitize the wait queue of the dispatch registration thread meaning removing
+ * invalid nodes from it. This is to avoid memory leaks for the case the UST
+ * notify socket is never received.
+ */
+static void sanitize_wait_queue(struct ust_reg_wait_queue *wait_queue)
+{
+	int ret, nb_fd = 0, i;
+	unsigned int fd_added = 0;
+	struct lttng_poll_event events;
+	struct ust_reg_wait_node *wait_node = NULL, *tmp_wait_node;
+
+	assert(wait_queue);
+
+	lttng_poll_init(&events);
+
+	/* Just skip everything for an empty queue. */
+	if (!wait_queue->count) {
+		goto end;
+	}
+
+	ret = lttng_poll_create(&events, wait_queue->count, LTTNG_CLOEXEC);
+	if (ret < 0) {
+		goto error_create;
+	}
+
+	cds_list_for_each_entry_safe(wait_node, tmp_wait_node,
+			&wait_queue->head, head) {
+		assert(wait_node->app);
+		ret = lttng_poll_add(&events, wait_node->app->sock,
+				LPOLLHUP | LPOLLERR);
+		if (ret < 0) {
+			goto error;
+		}
+
+		fd_added = 1;
+	}
+
+	if (!fd_added) {
+		goto end;
+	}
+
+	/*
+	 * Poll but don't block so we can quickly identify the faulty events and
+	 * clean them afterwards from the wait queue.
+	 */
+	ret = lttng_poll_wait(&events, 0);
+	if (ret < 0) {
+		goto error;
+	}
+	nb_fd = ret;
+
+	for (i = 0; i < nb_fd; i++) {
+		/* Get faulty FD. */
+		uint32_t revents = LTTNG_POLL_GETEV(&events, i);
+		int pollfd = LTTNG_POLL_GETFD(&events, i);
+
+		cds_list_for_each_entry_safe(wait_node, tmp_wait_node,
+				&wait_queue->head, head) {
+			if (pollfd == wait_node->app->sock &&
+					(revents & (LPOLLHUP | LPOLLERR))) {
+				cds_list_del(&wait_node->head);
+				wait_queue->count--;
+				ust_app_destroy(wait_node->app);
+				free(wait_node);
+				break;
+			}
+		}
+	}
+
+	if (nb_fd > 0) {
+		DBG("Wait queue sanitized, %d node were cleaned up", nb_fd);
+	}
+
+end:
+	lttng_poll_clean(&events);
+	return;
+
+error:
+	lttng_poll_clean(&events);
+error_create:
+	ERR("Unable to sanitize wait queue");
+	return;
+}
+
 /*
  * Dispatch request from the registration threads to the application
  * communication thread.
  */
 static void *thread_dispatch_ust_registration(void *data)
 {
-	int ret;
+	int ret, err = -1;
 	struct cds_wfq_node *node;
 	struct ust_command *ust_cmd = NULL;
-	struct {
-		struct ust_app *app;
-		struct cds_list_head head;
-	} *wait_node = NULL, *tmp_wait_node;
+	struct ust_reg_wait_node *wait_node = NULL, *tmp_wait_node;
+	struct ust_reg_wait_queue wait_queue = {
+		.count = 0,
+	};
 
-	CDS_LIST_HEAD(wait_queue);
+	health_register(HEALTH_TYPE_APP_REG_DISPATCH);
+
+	health_code_update();
+
+	CDS_INIT_LIST_HEAD(&wait_queue.head);
 
 	DBG("[thread] Dispatch UST command started");
 
 	while (!CMM_LOAD_SHARED(dispatch_thread_exit)) {
+		health_code_update();
+
 		/* Atomically prepare the queue futex */
 		futex_nto1_prepare(&ust_cmd_queue.futex);
 
@@ -1360,6 +1509,14 @@ static void *thread_dispatch_ust_registration(void *data)
 		struct ust_app *app = NULL;
 		ust_cmd = NULL;
 
+		/*
+		 * Make sure we don't have node(s) that have hung up before receiving
+		 * the notify socket. This is to clean the list in order to avoid
+		 * memory leaks from notify socket that are never seen.
+		 */
+		sanitize_wait_queue(&wait_queue);
+
+		health_code_update();
 		/* Dequeue command for registration */
 		node = cds_wfq_dequeue_blocking(&ust_cmd_queue.queue);
 		if (node == NULL) {
@@ -1408,7 +1565,8 @@ static void *thread_dispatch_ust_registration(void *data)
 				 * Add application to the wait queue so we can set the notify
 				 * socket before putting this object in the global ht.
 				 */
-				cds_list_add(&wait_node->head, &wait_queue);
+				cds_list_add(&wait_node->head, &wait_queue.head);
+				wait_queue.count++;
 
 				free(ust_cmd);
 				/*
@@ -1423,10 +1581,12 @@ static void *thread_dispatch_ust_registration(void *data)
 				 * notify socket if found.
 				 */
 				cds_list_for_each_entry_safe(wait_node, tmp_wait_node,
-						&wait_queue, head) {
+						&wait_queue.head, head) {
+					health_code_update();
 					if (wait_node->app->pid == ust_cmd->reg_msg.pid) {
 						wait_node->app->notify_sock = ust_cmd->sock;
 						cds_list_del(&wait_node->head);
+						wait_queue.count--;
 						app = wait_node->app;
 						free(wait_node);
 						DBG3("UST app notify socket %d is set", ust_cmd->sock);
@@ -1510,19 +1670,29 @@ static void *thread_dispatch_ust_registration(void *data)
 			}
 		} while (node != NULL);
 
+		health_poll_entry();
 		/* Futex wait on queue. Blocking call on futex() */
 		futex_nto1_wait(&ust_cmd_queue.futex);
+		health_poll_exit();
 	}
 
+	/* Normal exit, no error */
+	err = 0;
 error:
 	/* Clean up wait queue. */
 	cds_list_for_each_entry_safe(wait_node, tmp_wait_node,
-			&wait_queue, head) {
+			&wait_queue.head, head) {
 		cds_list_del(&wait_node->head);
+		wait_queue.count--;
 		free(wait_node);
 	}
 
 	DBG("Dispatch thread dying");
+	if (err) {
+		health_error();
+		ERR("Health error occurred in %s", __func__);
+	}
+	health_unregister();
 	return NULL;
 }
 
@@ -2265,6 +2435,8 @@ static int create_ust_session(struct ltt_session *session,
 
 	lus->uid = session->uid;
 	lus->gid = session->gid;
+	lus->output_traces = session->output_traces;
+	lus->snapshot_mode = session->snapshot_mode;
 	session->ust_session = lus;
 
 	/* Copy session output to the newly created UST session */
@@ -2321,6 +2493,8 @@ static int create_kernel_session(struct ltt_session *session)
 
 	session->kernel_session->uid = session->uid;
 	session->kernel_session->gid = session->gid;
+	session->kernel_session->output_traces = session->output_traces;
+	session->kernel_session->snapshot_mode = session->snapshot_mode;
 
 	return LTTNG_OK;
 
@@ -2376,12 +2550,17 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock,
 
 	switch (cmd_ctx->lsm->cmd_type) {
 	case LTTNG_CREATE_SESSION:
+	case LTTNG_CREATE_SESSION_SNAPSHOT:
 	case LTTNG_DESTROY_SESSION:
 	case LTTNG_LIST_SESSIONS:
 	case LTTNG_LIST_DOMAINS:
 	case LTTNG_START_TRACE:
 	case LTTNG_STOP_TRACE:
 	case LTTNG_DATA_PENDING:
+	case LTTNG_SNAPSHOT_ADD_OUTPUT:
+	case LTTNG_SNAPSHOT_DEL_OUTPUT:
+	case LTTNG_SNAPSHOT_LIST_OUTPUT:
+	case LTTNG_SNAPSHOT_RECORD:
 		need_domain = 0;
 		break;
 	default:
@@ -2434,6 +2613,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock,
 	/* Commands that DO NOT need a session. */
 	switch (cmd_ctx->lsm->cmd_type) {
 	case LTTNG_CREATE_SESSION:
+	case LTTNG_CREATE_SESSION_SNAPSHOT:
 	case LTTNG_CALIBRATE:
 	case LTTNG_LIST_SESSIONS:
 	case LTTNG_LIST_TRACEPOINTS:
@@ -2450,12 +2630,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock,
 		session_lock_list();
 		cmd_ctx->session = session_find_by_name(cmd_ctx->lsm->session.name);
 		if (cmd_ctx->session == NULL) {
-			if (cmd_ctx->lsm->session.name != NULL) {
-				ret = LTTNG_ERR_SESS_NOT_FOUND;
-			} else {
-				/* If no session name specified */
-				ret = LTTNG_ERR_SELECT_SESS;
-			}
+			ret = LTTNG_ERR_SESS_NOT_FOUND;
 			goto error;
 		} else {
 			/* Acquire lock for the session */
@@ -2532,6 +2707,10 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock,
 		break;
 	case LTTNG_DOMAIN_UST:
 	{
+		if (!ust_app_supported()) {
+			ret = LTTNG_ERR_NO_UST;
+			goto error;
+		}
 		/* Consumer is in an ERROR state. Report back to client */
 		if (uatomic_read(&ust_consumerd_state) == CONSUMER_ERROR) {
 			ret = LTTNG_ERR_NO_USTCONSUMERD;
 			goto error;
@@ -3069,6 +3248,106 @@ skip_domain:
 		ret = cmd_data_pending(cmd_ctx->session);
 		break;
 	}
+	case LTTNG_SNAPSHOT_ADD_OUTPUT:
+	{
+		struct lttcomm_lttng_output_id reply;
+
+		ret = cmd_snapshot_add_output(cmd_ctx->session,
+				&cmd_ctx->lsm->u.snapshot_output.output, &reply.id);
+		if (ret != LTTNG_OK) {
+			goto error;
+		}
+
+		ret = setup_lttng_msg(cmd_ctx, sizeof(reply));
+		if (ret < 0) {
+			goto setup_error;
+		}
+
+		/* Copy output list into message payload */
+		memcpy(cmd_ctx->llm->payload, &reply, sizeof(reply));
+		ret = LTTNG_OK;
+		break;
+	}
+	case LTTNG_SNAPSHOT_DEL_OUTPUT:
+	{
+		ret = cmd_snapshot_del_output(cmd_ctx->session,
+				&cmd_ctx->lsm->u.snapshot_output.output);
+		break;
+	}
+	case LTTNG_SNAPSHOT_LIST_OUTPUT:
+	{
+		ssize_t nb_output;
+		struct lttng_snapshot_output *outputs = NULL;
+
+		nb_output = cmd_snapshot_list_outputs(cmd_ctx->session, &outputs);
+		if (nb_output < 0) {
+			ret = -nb_output;
+			goto error;
+		}
+
+		ret = setup_lttng_msg(cmd_ctx,
+				nb_output * sizeof(struct lttng_snapshot_output));
+		if (ret < 0) {
+			free(outputs);
+			goto setup_error;
+		}
+
+		if (outputs) {
+			/* Copy output list into message payload */
+			memcpy(cmd_ctx->llm->payload, outputs,
+					nb_output * sizeof(struct lttng_snapshot_output));
+			free(outputs);
+		}
+
+		ret = LTTNG_OK;
+		break;
+	}
+	case LTTNG_SNAPSHOT_RECORD:
+	{
+		ret = cmd_snapshot_record(cmd_ctx->session,
+				&cmd_ctx->lsm->u.snapshot_record.output,
+				cmd_ctx->lsm->u.snapshot_record.wait);
+		break;
+	}
+	case LTTNG_CREATE_SESSION_SNAPSHOT:
+	{
+		size_t nb_uri, len;
+		struct lttng_uri *uris = NULL;
+
+		nb_uri = cmd_ctx->lsm->u.uri.size;
+		len = nb_uri * sizeof(struct lttng_uri);
+
+		if (nb_uri > 0) {
+			uris = zmalloc(len);
+			if (uris == NULL) {
+				ret = LTTNG_ERR_FATAL;
+				goto error;
+			}
+
+			/* Receive variable len data */
+			DBG("Waiting for %zu URIs from client ...", nb_uri);
+			ret = lttcomm_recv_unix_sock(sock, uris, len);
+			if (ret <= 0) {
continuing"); + *sock_error = 1; + ret = LTTNG_ERR_SESSION_FAIL; + free(uris); + goto error; + } + + if (nb_uri == 1 && uris[0].dtype != LTTNG_DST_PATH) { + DBG("Creating session with ONE network URI is a bad call"); + ret = LTTNG_ERR_SESSION_FAIL; + free(uris); + goto error; + } + } + + ret = cmd_create_session_snapshot(cmd_ctx->lsm->session.name, uris, + nb_uri, &cmd_ctx->creds); + free(uris); + break; + } default: ret = LTTNG_ERR_UND; break; @@ -3232,6 +3511,9 @@ restart: case LTTNG_HEALTH_APP_MANAGE_NOTIFY: reply.ret_code = health_check_state(HEALTH_TYPE_APP_MANAGE_NOTIFY); break; + case LTTNG_HEALTH_APP_REG_DISPATCH: + reply.ret_code = health_check_state(HEALTH_TYPE_APP_REG_DISPATCH); + break; case LTTNG_HEALTH_ALL: reply.ret_code = health_check_state(HEALTH_TYPE_APP_MANAGE) && @@ -3240,7 +3522,8 @@ restart: health_check_state(HEALTH_TYPE_KERNEL) && check_consumer_health() && health_check_state(HEALTH_TYPE_HT_CLEANUP) && - health_check_state(HEALTH_TYPE_APP_MANAGE_NOTIFY); + health_check_state(HEALTH_TYPE_APP_MANAGE_NOTIFY) && + health_check_state(HEALTH_TYPE_APP_REG_DISPATCH); break; default: reply.ret_code = LTTNG_ERR_UND; @@ -3793,12 +4076,12 @@ static int set_permissions(char *rundir) ret = allowed_group(); if (ret < 0) { WARN("No tracing group detected"); - ret = 0; - goto end; + /* Setting gid to 0 if no tracing group is found */ + gid = 0; + } else { + gid = ret; } - gid = ret; - /* Set lttng run dir */ ret = chown(rundir, 0, gid); if (ret < 0) { @@ -3806,7 +4089,7 @@ static int set_permissions(char *rundir) PERROR("chown"); } - /* Ensure tracing group can search the run dir */ + /* Ensure all applications and tracing group can search the run dir */ ret = chmod(rundir, S_IRWXU | S_IXGRP | S_IXOTH); if (ret < 0) { ERR("Unable to set permissions on %s", rundir); @@ -3843,7 +4126,6 @@ static int set_permissions(char *rundir) DBG("All permissions are set"); -end: return ret; } @@ -3917,6 +4199,16 @@ static int set_consumer_sockets(struct consumer_data *consumer_data, goto error; } + /* + * Set the CLOEXEC flag. Return code is useless because either way, the + * show must go on. + */ + ret = utils_set_fd_cloexec(consumer_data->err_sock); + if (ret < 0) { + PERROR("utils_set_fd_cloexec"); + /* continue anyway */ + } + /* File permission MUST be 660 */ ret = chmod(consumer_data->err_unix_sock_path, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); @@ -4145,7 +4437,7 @@ int main(int argc, char **argv) DBG2("Kernel consumer cmd path: %s", kconsumer_data.cmd_unix_sock_path); } else { - home_path = get_home_dir(); + home_path = utils_get_home_dir(); if (home_path == NULL) { /* TODO: Add --socket PATH option */ ERR("Can't get HOME directory for sockets creation."); @@ -4345,6 +4637,9 @@ int main(int argc, char **argv) write_pidfile(); + /* Initialize communication library */ + lttcomm_init(); + /* Create thread to manage the client socket */ ret = pthread_create(&ht_cleanup_thread, NULL, thread_ht_cleanup, (void *) NULL);