Fix: set the health delta tcp timeout aware
[lttng-tools.git] / src / bin / lttng-sessiond / health.c
CommitLineData
44a5e5eb
DG
1/*
2 * Copyright (C) 2012 - David Goulet <dgoulet@efficios.com>
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License, version 2 only, as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 51
15 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16 */
17
18#define _GNU_SOURCE
19#include <assert.h>
20#include <inttypes.h>
21#include <stdio.h>
22#include <stdlib.h>
8809eec0 23#include <time.h>
44a5e5eb 24
8809eec0 25#include <common/defaults.h>
44a5e5eb 26#include <common/error.h>
67e05644
DG
27#include <common/macros.h>
28#include <common/sessiond-comm/inet.h>
44a5e5eb
DG
29
30#include "health.h"
31
67e05644 32static struct timespec time_delta = {
8809eec0
MD
33 .tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S,
34 .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS,
35};
36
927ca06a
DG
37/* Define TLS health state. */
38DEFINE_URCU_TLS(struct health_state, health_state);
39
40/*
41 * It ensures that TLS memory used for the node and its container structure
42 * don't get reclaimed after the TLS owner thread exits until we have finished
43 * using it.
44 */
45static pthread_mutex_t health_mutex = PTHREAD_MUTEX_INITIALIZER;
46
47static struct health_tls_state_list health_state_list = {
48 .head = CDS_LIST_HEAD_INIT(health_state_list.head),
49};
50
51/*
52 * This keeps track of the error state for unregistered thread. A thread
53 * reporting a health error, normally unregisters and quits. This makes the TLS
54 * health state not available to the health_check_state() call so on unregister
55 * we update this global error array so we can keep track of which thread was
56 * on error if the TLS health state has been removed.
57 */
58static enum health_flags global_error_state[HEALTH_NUM_TYPE];
59
60/*
61 * Lock health state global list mutex.
62 */
63static void state_lock(void)
64{
65 pthread_mutex_lock(&health_mutex);
66}
67
68/*
69 * Unlock health state global list mutex.
70 */
71static void state_unlock(void)
72{
73 pthread_mutex_unlock(&health_mutex);
74}
75
8809eec0
MD
/*
 * Compute time_a - time_b and store the result in res.
 *
 * When the nanosecond subtraction is negative, one second is borrowed so that
 * res->tv_nsec always ends up in the [0, 1000000000) range; a negative
 * difference is therefore expressed through a negative res->tv_sec.
 *
 * The difference is computed into locals before being stored, so res may
 * point to the same object as time_a or time_b.
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second to bring the nanoseconds back in range. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
90
/*
 * Tell whether the time elapsed from time_b to time_a is strictly greater
 * than diff.
 *
 * Return 1 if time_a - time_b > diff, else 0.
 */
static int time_diff_gt(const struct timespec *time_a,
		const struct timespec *time_b, const struct timespec *diff)
{
	struct timespec excess;

	/* excess = (time_a - time_b) - diff, the second step done in place. */
	time_diff(time_a, time_b, &excess);
	time_diff(&excess, diff, &excess);

	/* Strictly positive remainder means the delta has been exceeded. */
	return excess.tv_sec > 0 ||
		(excess.tv_sec == 0 && excess.tv_nsec > 0);
}
110
44a5e5eb 111/*
c89add41 112 * Validate health state. Checks for the error flag or health conditions.
44a5e5eb
DG
113 *
114 * Return 0 if health is bad or else 1.
115 */
c89add41 116static int validate_state(struct health_state *state)
44a5e5eb 117{
8809eec0 118 int retval = 1, ret;
139ac872 119 unsigned long current, last;
8809eec0 120 struct timespec current_time;
927ca06a 121
c89add41 122 assert(state);
44a5e5eb 123
139ac872 124 last = state->last;
44a5e5eb 125 current = uatomic_read(&state->current);
44a5e5eb 126
8809eec0 127 ret = clock_gettime(CLOCK_MONOTONIC, &current_time);
931a97e5 128 if (ret < 0) {
8809eec0 129 PERROR("Error reading time\n");
139ac872 130 /* error */
8809eec0
MD
131 retval = 0;
132 goto end;
44a5e5eb
DG
133 }
134
8809eec0
MD
135 /*
136 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
137 * health if, after the delta delay has passed, its the progress counter
138 * has not moved and it has NOT been waiting for a poll() call.
139 */
140 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
141 retval = 0;
142 goto end;
143 }
44a5e5eb 144
139ac872 145 /*
8809eec0
MD
146 * Initial condition need to update the last counter and sample time, but
147 * should not check health in this initial case, because we don't know how
148 * much time has passed.
139ac872 149 */
8809eec0
MD
150 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
151 /* update last counter and last sample time */
152 state->last = current;
153 memcpy(&state->last_time, &current_time, sizeof(current_time));
154 } else {
155 if (time_diff_gt(&current_time, &state->last_time, &time_delta)) {
156 if (current == last && !HEALTH_IS_IN_POLL(current)) {
157 /* error */
158 retval = 0;
159 }
160 /* update last counter and last sample time */
161 state->last = current;
162 memcpy(&state->last_time, &current_time, sizeof(current_time));
c89add41
DG
163
164 /* On error, stop right now and notify caller. */
165 if (retval == 0) {
166 goto end;
167 }
8809eec0
MD
168 }
169 }
170
171end:
77c7c900 172 DBG("Health state current %lu, last %lu, ret %d",
8809eec0 173 current, last, ret);
c89add41
DG
174 return retval;
175}
176
177/*
178 * Check health of a specific health type. Note that if a thread has not yet
179 * initialize its health subsystem or has quit, it's considered in a good
180 * state.
181 *
182 * Return 0 if health is bad or else 1.
183 */
184int health_check_state(enum health_type type)
185{
186 int retval = 1;
187 struct health_state *state;
188
189 assert(type < HEALTH_NUM_TYPE);
190
191 state_lock();
192
193 cds_list_for_each_entry(state, &health_state_list.head, node) {
194 int ret;
195
196 if (state->type != type) {
197 continue;
198 }
199
200 ret = validate_state(state);
201 if (!ret) {
202 retval = 0;
203 goto end;
204 }
205 }
206
207 /* Check the global state since some state might not be visible anymore. */
208 if (global_error_state[type] & HEALTH_ERROR) {
209 retval = 0;
210 }
211
212end:
927ca06a 213 state_unlock();
139ac872 214
c89add41
DG
215 DBG("Health check for type %d is %s", (int) type,
216 (retval == 0) ? "BAD" : "GOOD");
8809eec0 217 return retval;
44a5e5eb 218}
927ca06a
DG
219
220/*
221 * Init health state.
222 */
223void health_register(enum health_type type)
224{
927ca06a
DG
225 assert(type < HEALTH_NUM_TYPE);
226
227 /* Init TLS state. */
228 uatomic_set(&URCU_TLS(health_state).last, 0);
229 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
230 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
231 uatomic_set(&URCU_TLS(health_state).current, 0);
232 uatomic_set(&URCU_TLS(health_state).flags, 0);
233 uatomic_set(&URCU_TLS(health_state).type, type);
234
235 /* Add it to the global TLS state list. */
236 state_lock();
927ca06a
DG
237 cds_list_add(&URCU_TLS(health_state).node, &health_state_list.head);
238 state_unlock();
239}
240
241/*
242 * Remove node from global list.
243 */
244void health_unregister(void)
245{
246 state_lock();
247 /*
248 * On error, set the global_error_state since we are about to remove
249 * the node from the global list.
250 */
251 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
252 uatomic_set(&global_error_state[URCU_TLS(health_state).type],
253 HEALTH_ERROR);
254 }
255 cds_list_del(&URCU_TLS(health_state).node);
256 state_unlock();
257}
67e05644
DG
258
259/*
260 * Initiliazie health check subsytem. This should be called before any health
261 * register occurs.
262 */
263void health_init(void)
264{
265 /*
266 * Get the maximum value between the default delta value and the TCP
267 * timeout with a safety net of the default health check delta.
268 */
269 time_delta.tv_sec = max_t(unsigned long,
270 lttcomm_inet_tcp_timeout + DEFAULT_HEALTH_CHECK_DELTA_S,
271 time_delta.tv_sec);
272 DBG("Health check time delta in seconds set to %lu", time_delta.tv_sec);
273}
This page took 0.044377 seconds and 5 git commands to generate.