X-Git-Url: http://git.efficios.com/?p=lttng-tools.git;a=blobdiff_plain;f=src%2Fbin%2Flttng-sessiond%2Fhealth.c;h=7e1d4731a71de92702f200c9cbb96359fb8a58ab;hp=7bf41c883de16bbea20745f2e4650a5115597e1e;hb=8782cc7477fae212607b9fd6395a4b2e2d3357ed;hpb=931a97e545601bbeeafc5b94729464a908ea1cd9
diff --git a/src/bin/lttng-sessiond/health.c b/src/bin/lttng-sessiond/health.c
index 7bf41c883..7e1d4731a 100644
--- a/src/bin/lttng-sessiond/health.c
+++ b/src/bin/lttng-sessiond/health.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2012 - David Goulet
+ * Copyright (C) 2013 - Mathieu Desnoyers
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License, version 2 only, as
@@ -24,14 +25,83 @@
 #include
 #include
+#include
+#include
 #include "health.h"
-static const struct timespec time_delta = {
-	.tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S,
-	.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS,
+/*
+ * Per-application health bookkeeping shared by the registered threads.
+ * A thread reporting a health error normally unregisters and quits, so
+ * its TLS health state is no longer reachable by health_check_state().
+ * On unregister the error is therefore recorded in the flags array
+ * below, indexed by thread type, so the failing type can still be
+ * reported once the TLS health state has been removed.
+ */
+struct health_app {
+	/* List of health state, one node per registered application thread */
+	struct cds_list_head list;
+	/*
+	 * This lock ensures that TLS memory used for the node and its
+	 * container structure don't get reclaimed after the TLS owner
+	 * thread exits until we have finished using it.
+	 */
+	pthread_mutex_t lock;
+	int nr_types;			/* Number of thread types; upper bound for type indexes */
+	struct timespec time_delta;	/* Delay after which an unchanged counter is reported bad */
+	/* Health flags containing thread type error state, indexed by type */
+	enum health_flags *flags;
 };
+/* Define TLS health state. 
*/
+DEFINE_URCU_TLS(struct health_state, health_state);
+
+/*
+ * Allocate and initialize a health application object.
+ *
+ * nr_types is the number of distinct thread types. It sizes the per-type
+ * flags array, which health_check_state() and health_unregister() index by
+ * type. The check delta starts at the DEFAULT_HEALTH_CHECK_DELTA_* values.
+ *
+ * Return the new object, to be released with health_app_destroy(), or NULL
+ * if nr_types is not positive, if an allocation fails or if the mutex
+ * cannot be initialized.
+ */
+struct health_app *health_app_create(int nr_types)
+{
+	struct health_app *ha;
+
+	/* A non-positive count cannot size the per-type flags array. */
+	if (nr_types <= 0) {
+		return NULL;
+	}
+
+	ha = zmalloc(sizeof(*ha));
+	if (!ha) {
+		return NULL;
+	}
+	/*
+	 * One slot per health type: flags[] is indexed by type, so a single
+	 * element is only enough when nr_types == 1.
+	 * NOTE(review): relies on zmalloc() returning zeroed memory so that
+	 * every flag starts clear -- confirm against its definition.
+	 */
+	ha->flags = zmalloc(sizeof(*ha->flags) * nr_types);
+	if (!ha->flags) {
+		goto error_flags;
+	}
+	CDS_INIT_LIST_HEAD(&ha->list);
+	if (pthread_mutex_init(&ha->lock, NULL)) {
+		goto error_mutex;
+	}
+	ha->nr_types = nr_types;
+	ha->time_delta.tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S;
+	ha->time_delta.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS;
+	return ha;
+
+error_mutex:
+	free(ha->flags);
+error_flags:
+	free(ha);
+	return NULL;
+}
+
+/*
+ * Release a health application object created by health_app_create().
+ * Passing NULL is a no-op. The caller must make sure no thread still uses
+ * the object or holds its lock.
+ */
+void health_app_destroy(struct health_app *ha)
+{
+	if (!ha) {
+		return;
+	}
+	pthread_mutex_destroy(&ha->lock);
+	free(ha->flags);
+	free(ha);
+}
+
+/*
+ * Lock the mutex protecting the health state list of this application.
+ */
+static void state_lock(struct health_app *ha)
+{
+	pthread_mutex_lock(&ha->lock);
+}
+
+/*
+ * Unlock the mutex protecting the health state list of this application.
+ */
+static void state_unlock(struct health_app *ha)
+{
+	pthread_mutex_unlock(&ha->lock);
+}
+
 /*
  * Set time difference in res from time_a and time_b.
  */
@@ -68,11 +138,11 @@ static int time_diff_gt(const struct timespec *time_a,
 }
 /*
- * Check health of a specific health state counter.
+ * Validate health state. Checks for the error flag or health conditions.
  *
  * Return 0 if health is bad or else 1. 
*/
-int health_check_state(struct health_state *state)
+static int validate_state(struct health_app *ha, struct health_state *state)
 {
 	int retval = 1, ret;
 	unsigned long current, last;
@@ -111,7 +181,8 @@ int health_check_state(struct health_state *state)
 		state->last = current;
 		memcpy(&state->last_time, &current_time, sizeof(current_time));
 	} else {
-		if (time_diff_gt(&current_time, &state->last_time, &time_delta)) {
+		if (time_diff_gt(&current_time, &state->last_time,
+				&ha->time_delta)) {
 			if (current == last && !HEALTH_IS_IN_POLL(current)) {
 				/* error */
 				retval = 0;
@@ -119,12 +190,115 @@ int health_check_state(struct health_state *state)
 			/* update last counter and last sample time */
 			state->last = current;
 			memcpy(&state->last_time, &current_time, sizeof(current_time));
+
+			/* On error, stop right now and notify caller. */
+			if (retval == 0) {
+				goto end;
+			}
 		}
 	}
 end:
-	DBG("Health state current %" PRIu64 ", last %" PRIu64 ", ret %d",
+	DBG("Health state current %lu, last %lu, ret %d",
 			current, last, ret);
+	return retval;
+}
+
+/*
+ * Check health of a specific health type.
+ *
+ * Every registered state of that type is validated while holding the list
+ * lock; the first bad one ends the check. A thread that has not registered
+ * yet, or that unregistered without the error flag set, contributes nothing
+ * and is thus considered in a good state. A thread that unregistered on
+ * error is still reported through the per-type flags array.
+ *
+ * type must be in the range [0, ha->nr_types).
+ *
+ * Return 0 if health is bad or else 1.
+ */
+int health_check_state(struct health_app *ha, int type)
+{
+	int retval = 1;
+	struct health_state *state;
+
+	/* A negative type would index before the start of ha->flags. */
+	assert(type >= 0 && type < ha->nr_types);
+
+	state_lock(ha);
+
+	cds_list_for_each_entry(state, &ha->list, node) {
+		if (state->type != type) {
+			continue;
+		}
+
+		if (!validate_state(ha, state)) {
+			retval = 0;
+			goto end;
+		}
+	}
+
+	/* Check the global state since some state might not be visible anymore. */
+	if (ha->flags[type] & HEALTH_ERROR) {
+		retval = 0;
+	}
+
+end:
+	state_unlock(ha);
+
+	DBG("Health check for type %d is %s", type,
+			(retval == 0) ? "BAD" : "GOOD");
 	return retval;
 }
+
+/*
+ * Init health state. 
+ *
+ * Reset the calling thread's TLS health state, tag it with type and link it
+ * into the application's state list so health_check_state() can see it.
+ * type must be in the range [0, ha->nr_types).
+ */
+void health_register(struct health_app *ha, int type)
+{
+	/* type is stored in the TLS state and later used to index ha->flags. */
+	assert(type >= 0 && type < ha->nr_types);
+
+	/* Init TLS state. */
+	uatomic_set(&URCU_TLS(health_state).last, 0);
+	uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
+	uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
+	uatomic_set(&URCU_TLS(health_state).current, 0);
+	uatomic_set(&URCU_TLS(health_state).flags, 0);
+	uatomic_set(&URCU_TLS(health_state).type, type);
+
+	/* Add it to the global TLS state list. */
+	state_lock(ha);
+	cds_list_add(&URCU_TLS(health_state).node, &ha->list);
+	state_unlock(ha);
+}
+
+/*
+ * Remove the calling thread's TLS health state from the application's
+ * state list. If that state carries the error flag, the error is first
+ * recorded in the per-type flags array so it stays visible to
+ * health_check_state().
+ */
+void health_unregister(struct health_app *ha)
+{
+	state_lock(ha);
+	/*
+	 * On error, record it in the per-type flags array since we are about
+	 * to remove the node from the list.
+	 */
+	if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
+		uatomic_set(&ha->flags[URCU_TLS(health_state).type],
+				HEALTH_ERROR);
+	}
+	cds_list_del(&URCU_TLS(health_state).node);
+	state_unlock(ha);
+}
+
+/*
+ * Initialize health check subsystem. This should be called before any health
+ * register occurs.
+ */
+void health_init(struct health_app *ha)
+{
+	/*
+	 * Get the maximum value between the default delta value and the TCP
+	 * timeout with a safety net of the default health check delta.
+	 */
+	ha->time_delta.tv_sec = max_t(unsigned long,
+			lttcomm_inet_tcp_timeout + DEFAULT_HEALTH_CHECK_DELTA_S,
+			ha->time_delta.tv_sec);
+	/* tv_sec is a time_t; cast so the argument matches the %lu conversion. */
+	DBG("Health check time delta in seconds set to %lu",
+			(unsigned long) ha->time_delta.tv_sec);
+}