Turn health.h/health.c into a library
[lttng-tools.git] / src / bin / lttng-sessiond / health.c
CommitLineData
44a5e5eb
DG
1/*
2 * Copyright (C) 2012 - David Goulet <dgoulet@efficios.com>
8782cc74 3 * Copyright (C) 2013 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
44a5e5eb
DG
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License, version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 51
16 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 */
18
19#define _GNU_SOURCE
20#include <assert.h>
21#include <inttypes.h>
22#include <stdio.h>
23#include <stdlib.h>
8809eec0 24#include <time.h>
44a5e5eb 25
8809eec0 26#include <common/defaults.h>
44a5e5eb 27#include <common/error.h>
67e05644
DG
28#include <common/macros.h>
29#include <common/sessiond-comm/inet.h>
44a5e5eb
DG
30
31#include "health.h"
32
8782cc74
MD
33/*
34 * An application-specific error state for unregistered thread keeps
35 * track of thread errors. A thread reporting a health error, normally
36 * unregisters and quits. This makes the TLS health state not available
37 * to the health_check_state() call so on unregister we update this
38 * global error array so we can keep track of which thread was on error
39 * if the TLS health state has been removed.
40 */
41struct health_app {
42 /* List of health state, for each application thread */
43 struct cds_list_head list;
44 /*
45 * This lock ensures that TLS memory used for the node and its
46 * container structure don't get reclaimed after the TLS owner
47 * thread exits until we have finished using it.
48 */
49 pthread_mutex_t lock;
50 int nr_types;
51 struct timespec time_delta;
52 /* Health flags containing thread type error state */
53 enum health_flags *flags;
8809eec0
MD
54};
55
927ca06a
DG
56/* Define TLS health state. */
57DEFINE_URCU_TLS(struct health_state, health_state);
58
8782cc74
MD
59struct health_app *health_app_create(int nr_types)
60{
61 struct health_app *ha;
927ca06a 62
8782cc74
MD
63 ha = zmalloc(sizeof(*ha));
64 if (!ha) {
65 return NULL;
66 }
67 ha->flags = zmalloc(sizeof(*ha->flags));
68 if (!ha->flags) {
69 goto error_flags;
70 }
71 CDS_INIT_LIST_HEAD(&ha->list);
72 pthread_mutex_init(&ha->lock, NULL);
73 ha->nr_types = nr_types;
74 ha->time_delta.tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S;
75 ha->time_delta.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS;
76 return ha;
77
78error_flags:
79 free(ha);
80 return NULL;
81}
927ca06a 82
8782cc74
MD
83void health_app_destroy(struct health_app *ha)
84{
85 free(ha->flags);
86 free(ha);
87}
927ca06a
DG
88
89/*
90 * Lock health state global list mutex.
91 */
8782cc74 92static void state_lock(struct health_app *ha)
927ca06a 93{
8782cc74 94 pthread_mutex_lock(&ha->lock);
927ca06a
DG
95}
96
97/*
98 * Unlock health state global list mutex.
99 */
8782cc74 100static void state_unlock(struct health_app *ha)
927ca06a 101{
8782cc74 102 pthread_mutex_unlock(&ha->lock);
927ca06a
DG
103}
104
8809eec0
MD
/*
 * Store time_a - time_b into res.
 *
 * When the nanosecond subtraction is negative, one second is borrowed so
 * that res->tv_nsec stays non-negative. Both differences are computed
 * before res is written, so res may alias time_a or time_b.
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second from the seconds field. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
119
/*
 * Tell whether the time elapsed between time_b and time_a exceeds diff.
 *
 * Return 1 if time_a - time_b > diff, else 0.
 */
static int time_diff_gt(const struct timespec *time_a,
		const struct timespec *time_b, const struct timespec *diff)
{
	struct timespec excess;

	/* excess = (time_a - time_b) - diff, computed in two steps. */
	time_diff(time_a, time_b, &excess);
	time_diff(&excess, diff, &excess);

	return excess.tv_sec > 0 ||
		(excess.tv_sec == 0 && excess.tv_nsec > 0);
}
139
44a5e5eb 140/*
c89add41 141 * Validate health state. Checks for the error flag or health conditions.
44a5e5eb
DG
142 *
143 * Return 0 if health is bad or else 1.
144 */
8782cc74 145static int validate_state(struct health_app *ha, struct health_state *state)
44a5e5eb 146{
8809eec0 147 int retval = 1, ret;
139ac872 148 unsigned long current, last;
8809eec0 149 struct timespec current_time;
927ca06a 150
c89add41 151 assert(state);
44a5e5eb 152
139ac872 153 last = state->last;
44a5e5eb 154 current = uatomic_read(&state->current);
44a5e5eb 155
8809eec0 156 ret = clock_gettime(CLOCK_MONOTONIC, &current_time);
931a97e5 157 if (ret < 0) {
8809eec0 158 PERROR("Error reading time\n");
139ac872 159 /* error */
8809eec0
MD
160 retval = 0;
161 goto end;
44a5e5eb
DG
162 }
163
8809eec0
MD
164 /*
165 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
166 * health if, after the delta delay has passed, its the progress counter
167 * has not moved and it has NOT been waiting for a poll() call.
168 */
169 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
170 retval = 0;
171 goto end;
172 }
44a5e5eb 173
139ac872 174 /*
8809eec0
MD
175 * Initial condition need to update the last counter and sample time, but
176 * should not check health in this initial case, because we don't know how
177 * much time has passed.
139ac872 178 */
8809eec0
MD
179 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
180 /* update last counter and last sample time */
181 state->last = current;
182 memcpy(&state->last_time, &current_time, sizeof(current_time));
183 } else {
8782cc74
MD
184 if (time_diff_gt(&current_time, &state->last_time,
185 &ha->time_delta)) {
8809eec0
MD
186 if (current == last && !HEALTH_IS_IN_POLL(current)) {
187 /* error */
188 retval = 0;
189 }
190 /* update last counter and last sample time */
191 state->last = current;
192 memcpy(&state->last_time, &current_time, sizeof(current_time));
c89add41
DG
193
194 /* On error, stop right now and notify caller. */
195 if (retval == 0) {
196 goto end;
197 }
8809eec0
MD
198 }
199 }
200
201end:
77c7c900 202 DBG("Health state current %lu, last %lu, ret %d",
8809eec0 203 current, last, ret);
c89add41
DG
204 return retval;
205}
206
207/*
208 * Check health of a specific health type. Note that if a thread has not yet
209 * initialize its health subsystem or has quit, it's considered in a good
210 * state.
211 *
212 * Return 0 if health is bad or else 1.
213 */
8782cc74 214int health_check_state(struct health_app *ha, int type)
c89add41
DG
215{
216 int retval = 1;
217 struct health_state *state;
218
8782cc74 219 assert(type < ha->nr_types);
c89add41 220
8782cc74 221 state_lock(ha);
c89add41 222
8782cc74 223 cds_list_for_each_entry(state, &ha->list, node) {
c89add41
DG
224 int ret;
225
226 if (state->type != type) {
227 continue;
228 }
229
8782cc74 230 ret = validate_state(ha, state);
c89add41
DG
231 if (!ret) {
232 retval = 0;
233 goto end;
234 }
235 }
236
237 /* Check the global state since some state might not be visible anymore. */
8782cc74 238 if (ha->flags[type] & HEALTH_ERROR) {
c89add41
DG
239 retval = 0;
240 }
241
242end:
8782cc74 243 state_unlock(ha);
139ac872 244
c89add41
DG
245 DBG("Health check for type %d is %s", (int) type,
246 (retval == 0) ? "BAD" : "GOOD");
8809eec0 247 return retval;
44a5e5eb 248}
927ca06a
DG
249
250/*
251 * Init health state.
252 */
8782cc74 253void health_register(struct health_app *ha, int type)
927ca06a 254{
8782cc74 255 assert(type < ha->nr_types);
927ca06a
DG
256
257 /* Init TLS state. */
258 uatomic_set(&URCU_TLS(health_state).last, 0);
259 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
260 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
261 uatomic_set(&URCU_TLS(health_state).current, 0);
262 uatomic_set(&URCU_TLS(health_state).flags, 0);
263 uatomic_set(&URCU_TLS(health_state).type, type);
264
265 /* Add it to the global TLS state list. */
8782cc74
MD
266 state_lock(ha);
267 cds_list_add(&URCU_TLS(health_state).node, &ha->list);
268 state_unlock(ha);
927ca06a
DG
269}
270
271/*
272 * Remove node from global list.
273 */
8782cc74 274void health_unregister(struct health_app *ha)
927ca06a 275{
8782cc74 276 state_lock(ha);
927ca06a
DG
277 /*
278 * On error, set the global_error_state since we are about to remove
279 * the node from the global list.
280 */
281 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
8782cc74 282 uatomic_set(&ha->flags[URCU_TLS(health_state).type],
927ca06a
DG
283 HEALTH_ERROR);
284 }
285 cds_list_del(&URCU_TLS(health_state).node);
8782cc74 286 state_unlock(ha);
927ca06a 287}
67e05644
DG
288
289/*
290 * Initiliazie health check subsytem. This should be called before any health
291 * register occurs.
292 */
8782cc74 293void health_init(struct health_app *ha)
67e05644
DG
294{
295 /*
296 * Get the maximum value between the default delta value and the TCP
297 * timeout with a safety net of the default health check delta.
298 */
8782cc74 299 ha->time_delta.tv_sec = max_t(unsigned long,
67e05644 300 lttcomm_inet_tcp_timeout + DEFAULT_HEALTH_CHECK_DELTA_S,
8782cc74
MD
301 ha->time_delta.tv_sec);
302 DBG("Health check time delta in seconds set to %lu",
303 ha->time_delta.tv_sec);
67e05644 304}
This page took 0.046548 seconds and 5 git commands to generate.