Turn health.h/health.c into a library
[lttng-tools.git] / src / bin / lttng-sessiond / health.c
... / ...
CommitLineData
1/*
2 * Copyright (C) 2012 - David Goulet <dgoulet@efficios.com>
3 * Copyright (C) 2013 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License, version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 51
16 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 */
18
19#define _GNU_SOURCE
20#include <assert.h>
21#include <inttypes.h>
22#include <stdio.h>
23#include <stdlib.h>
24#include <time.h>
25
26#include <common/defaults.h>
27#include <common/error.h>
28#include <common/macros.h>
29#include <common/sessiond-comm/inet.h>
30
31#include "health.h"
32
/*
 * Per-application health bookkeeping.
 *
 * An application-specific error state for unregistered threads keeps
 * track of thread errors. A thread reporting a health error normally
 * unregisters and quits. This makes its TLS health state unavailable
 * to the health_check_state() call, so on unregister we record the
 * error in the flags array below. That way we can still tell which
 * thread type was in error once the TLS health state has been removed.
 */
struct health_app {
	/* List of health state, for each application thread */
	struct cds_list_head list;
	/*
	 * This lock ensures that TLS memory used for the node and its
	 * container structure don't get reclaimed after the TLS owner
	 * thread exits until we have finished using it.
	 */
	pthread_mutex_t lock;
	/*
	 * Number of thread types; health_register() and
	 * health_check_state() assert that type < nr_types.
	 */
	int nr_types;
	/*
	 * Delay that must elapse between two samples before
	 * validate_state() compares a thread's progress counters.
	 */
	struct timespec time_delta;
	/* Health flags containing thread type error state, indexed by type */
	enum health_flags *flags;
};
55
/*
 * Define TLS health state. Each thread initializes its own instance in
 * health_register(), which also links it into the health_app list;
 * health_unregister() removes it from that list.
 */
DEFINE_URCU_TLS(struct health_state, health_state);
58
59struct health_app *health_app_create(int nr_types)
60{
61 struct health_app *ha;
62
63 ha = zmalloc(sizeof(*ha));
64 if (!ha) {
65 return NULL;
66 }
67 ha->flags = zmalloc(sizeof(*ha->flags));
68 if (!ha->flags) {
69 goto error_flags;
70 }
71 CDS_INIT_LIST_HEAD(&ha->list);
72 pthread_mutex_init(&ha->lock, NULL);
73 ha->nr_types = nr_types;
74 ha->time_delta.tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S;
75 ha->time_delta.tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS;
76 return ha;
77
78error_flags:
79 free(ha);
80 return NULL;
81}
82
83void health_app_destroy(struct health_app *ha)
84{
85 free(ha->flags);
86 free(ha);
87}
88
89/*
90 * Lock health state global list mutex.
91 */
92static void state_lock(struct health_app *ha)
93{
94 pthread_mutex_lock(&ha->lock);
95}
96
97/*
98 * Unlock health state global list mutex.
99 */
100static void state_unlock(struct health_app *ha)
101{
102 pthread_mutex_unlock(&ha->lock);
103}
104
/*
 * Set res to time_a - time_b.
 *
 * When the nanosecond subtraction is negative, one second is borrowed
 * so that res->tv_nsec stays non-negative (given inputs whose tv_nsec
 * are themselves in the [0, 1000000000) range).
 *
 * The result is computed in locals before being stored, so res may
 * alias time_a or time_b.
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second to bring the nanoseconds back in range. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
119
/*
 * Tell whether the time elapsed between time_b and time_a is strictly
 * greater than diff.
 *
 * Return 1 if time_a - time_b > diff, else 0.
 */
static int time_diff_gt(const struct timespec *time_a,
		const struct timespec *time_b, const struct timespec *diff)
{
	struct timespec excess;

	/* excess = (time_a - time_b) - diff, computed in place. */
	time_diff(time_a, time_b, &excess);
	time_diff(&excess, diff, &excess);

	/* Strictly positive remainder means the delay was exceeded. */
	return excess.tv_sec > 0 ||
		(excess.tv_sec == 0 && excess.tv_nsec > 0);
}
139
140/*
141 * Validate health state. Checks for the error flag or health conditions.
142 *
143 * Return 0 if health is bad or else 1.
144 */
145static int validate_state(struct health_app *ha, struct health_state *state)
146{
147 int retval = 1, ret;
148 unsigned long current, last;
149 struct timespec current_time;
150
151 assert(state);
152
153 last = state->last;
154 current = uatomic_read(&state->current);
155
156 ret = clock_gettime(CLOCK_MONOTONIC, &current_time);
157 if (ret < 0) {
158 PERROR("Error reading time\n");
159 /* error */
160 retval = 0;
161 goto end;
162 }
163
164 /*
165 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
166 * health if, after the delta delay has passed, its the progress counter
167 * has not moved and it has NOT been waiting for a poll() call.
168 */
169 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
170 retval = 0;
171 goto end;
172 }
173
174 /*
175 * Initial condition need to update the last counter and sample time, but
176 * should not check health in this initial case, because we don't know how
177 * much time has passed.
178 */
179 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
180 /* update last counter and last sample time */
181 state->last = current;
182 memcpy(&state->last_time, &current_time, sizeof(current_time));
183 } else {
184 if (time_diff_gt(&current_time, &state->last_time,
185 &ha->time_delta)) {
186 if (current == last && !HEALTH_IS_IN_POLL(current)) {
187 /* error */
188 retval = 0;
189 }
190 /* update last counter and last sample time */
191 state->last = current;
192 memcpy(&state->last_time, &current_time, sizeof(current_time));
193
194 /* On error, stop right now and notify caller. */
195 if (retval == 0) {
196 goto end;
197 }
198 }
199 }
200
201end:
202 DBG("Health state current %lu, last %lu, ret %d",
203 current, last, ret);
204 return retval;
205}
206
207/*
208 * Check health of a specific health type. Note that if a thread has not yet
209 * initialize its health subsystem or has quit, it's considered in a good
210 * state.
211 *
212 * Return 0 if health is bad or else 1.
213 */
214int health_check_state(struct health_app *ha, int type)
215{
216 int retval = 1;
217 struct health_state *state;
218
219 assert(type < ha->nr_types);
220
221 state_lock(ha);
222
223 cds_list_for_each_entry(state, &ha->list, node) {
224 int ret;
225
226 if (state->type != type) {
227 continue;
228 }
229
230 ret = validate_state(ha, state);
231 if (!ret) {
232 retval = 0;
233 goto end;
234 }
235 }
236
237 /* Check the global state since some state might not be visible anymore. */
238 if (ha->flags[type] & HEALTH_ERROR) {
239 retval = 0;
240 }
241
242end:
243 state_unlock(ha);
244
245 DBG("Health check for type %d is %s", (int) type,
246 (retval == 0) ? "BAD" : "GOOD");
247 return retval;
248}
249
250/*
251 * Init health state.
252 */
253void health_register(struct health_app *ha, int type)
254{
255 assert(type < ha->nr_types);
256
257 /* Init TLS state. */
258 uatomic_set(&URCU_TLS(health_state).last, 0);
259 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
260 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
261 uatomic_set(&URCU_TLS(health_state).current, 0);
262 uatomic_set(&URCU_TLS(health_state).flags, 0);
263 uatomic_set(&URCU_TLS(health_state).type, type);
264
265 /* Add it to the global TLS state list. */
266 state_lock(ha);
267 cds_list_add(&URCU_TLS(health_state).node, &ha->list);
268 state_unlock(ha);
269}
270
271/*
272 * Remove node from global list.
273 */
274void health_unregister(struct health_app *ha)
275{
276 state_lock(ha);
277 /*
278 * On error, set the global_error_state since we are about to remove
279 * the node from the global list.
280 */
281 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
282 uatomic_set(&ha->flags[URCU_TLS(health_state).type],
283 HEALTH_ERROR);
284 }
285 cds_list_del(&URCU_TLS(health_state).node);
286 state_unlock(ha);
287}
288
289/*
290 * Initiliazie health check subsytem. This should be called before any health
291 * register occurs.
292 */
293void health_init(struct health_app *ha)
294{
295 /*
296 * Get the maximum value between the default delta value and the TCP
297 * timeout with a safety net of the default health check delta.
298 */
299 ha->time_delta.tv_sec = max_t(unsigned long,
300 lttcomm_inet_tcp_timeout + DEFAULT_HEALTH_CHECK_DELTA_S,
301 ha->time_delta.tv_sec);
302 DBG("Health check time delta in seconds set to %lu",
303 ha->time_delta.tv_sec);
304}
This page took 0.024681 seconds and 5 git commands to generate.