Fix: support duplicate health type
[lttng-tools.git] / src / bin / lttng-sessiond / health.c
1 /*
2 * Copyright (C) 2012 - David Goulet <dgoulet@efficios.com>
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License, version 2 only, as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 51
15 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16 */
17
18 #define _GNU_SOURCE
19 #include <assert.h>
20 #include <inttypes.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <time.h>
24
25 #include <common/defaults.h>
26 #include <common/error.h>
27
28 #include "health.h"
29
30 static const struct timespec time_delta = {
31 .tv_sec = DEFAULT_HEALTH_CHECK_DELTA_S,
32 .tv_nsec = DEFAULT_HEALTH_CHECK_DELTA_NS,
33 };
34
35 /* Define TLS health state. */
36 DEFINE_URCU_TLS(struct health_state, health_state);
37
38 /*
39 * It ensures that TLS memory used for the node and its container structure
40 * don't get reclaimed after the TLS owner thread exits until we have finished
41 * using it.
42 */
43 static pthread_mutex_t health_mutex = PTHREAD_MUTEX_INITIALIZER;
44
45 static struct health_tls_state_list health_state_list = {
46 .head = CDS_LIST_HEAD_INIT(health_state_list.head),
47 };
48
49 /*
50 * This keeps track of the error state for unregistered thread. A thread
51 * reporting a health error, normally unregisters and quits. This makes the TLS
52 * health state not available to the health_check_state() call so on unregister
53 * we update this global error array so we can keep track of which thread was
54 * on error if the TLS health state has been removed.
55 */
56 static enum health_flags global_error_state[HEALTH_NUM_TYPE];
57
58 /*
59 * Lock health state global list mutex.
60 */
61 static void state_lock(void)
62 {
63 pthread_mutex_lock(&health_mutex);
64 }
65
66 /*
67 * Unlock health state global list mutex.
68 */
69 static void state_unlock(void)
70 {
71 pthread_mutex_unlock(&health_mutex);
72 }
73
/*
 * Compute time_a - time_b and store the result in res.
 *
 * When the nanosecond difference is negative, one second is borrowed so that
 * the resulting tv_nsec is brought back into the [0, 1000000000) range for
 * normalized inputs.
 *
 * The difference is computed into local variables before res is written, so
 * res may safely point to the same object as time_a or time_b.
 */
static void time_diff(const struct timespec *time_a,
		const struct timespec *time_b, struct timespec *res)
{
	time_t sec = time_a->tv_sec - time_b->tv_sec;
	long nsec = time_a->tv_nsec - time_b->tv_nsec;

	if (nsec < 0) {
		/* Borrow one second to keep the nanosecond part non-negative. */
		sec -= 1;
		nsec += 1000000000L;
	}

	res->tv_sec = sec;
	res->tv_nsec = nsec;
}
88
/*
 * Tell whether the interval time_a - time_b is strictly longer than diff.
 *
 * Return 1 if time_a - time_b > diff, 0 otherwise.
 */
static int time_diff_gt(const struct timespec *time_a,
		const struct timespec *time_b, const struct timespec *diff)
{
	struct timespec remainder;

	/* remainder = (time_a - time_b) - diff, computed in two steps. */
	time_diff(time_a, time_b, &remainder);
	time_diff(&remainder, diff, &remainder);

	/* A strictly positive remainder means the interval exceeds diff. */
	return remainder.tv_sec > 0 ||
		(remainder.tv_sec == 0 && remainder.tv_nsec > 0);
}
108
109 /*
110 * Validate health state. Checks for the error flag or health conditions.
111 *
112 * Return 0 if health is bad or else 1.
113 */
114 static int validate_state(struct health_state *state)
115 {
116 int retval = 1, ret;
117 unsigned long current, last;
118 struct timespec current_time;
119
120 assert(state);
121
122 last = state->last;
123 current = uatomic_read(&state->current);
124
125 ret = clock_gettime(CLOCK_MONOTONIC, &current_time);
126 if (ret < 0) {
127 PERROR("Error reading time\n");
128 /* error */
129 retval = 0;
130 goto end;
131 }
132
133 /*
134 * Thread is in bad health if flag HEALTH_ERROR is set. It is also in bad
135 * health if, after the delta delay has passed, its the progress counter
136 * has not moved and it has NOT been waiting for a poll() call.
137 */
138 if (uatomic_read(&state->flags) & HEALTH_ERROR) {
139 retval = 0;
140 goto end;
141 }
142
143 /*
144 * Initial condition need to update the last counter and sample time, but
145 * should not check health in this initial case, because we don't know how
146 * much time has passed.
147 */
148 if (state->last_time.tv_sec == 0 && state->last_time.tv_nsec == 0) {
149 /* update last counter and last sample time */
150 state->last = current;
151 memcpy(&state->last_time, &current_time, sizeof(current_time));
152 } else {
153 if (time_diff_gt(&current_time, &state->last_time, &time_delta)) {
154 if (current == last && !HEALTH_IS_IN_POLL(current)) {
155 /* error */
156 retval = 0;
157 }
158 /* update last counter and last sample time */
159 state->last = current;
160 memcpy(&state->last_time, &current_time, sizeof(current_time));
161
162 /* On error, stop right now and notify caller. */
163 if (retval == 0) {
164 goto end;
165 }
166 }
167 }
168
169 end:
170 DBG("Health state current %lu, last %lu, ret %d",
171 current, last, ret);
172 return retval;
173 }
174
175 /*
176 * Check health of a specific health type. Note that if a thread has not yet
177 * initialize its health subsystem or has quit, it's considered in a good
178 * state.
179 *
180 * Return 0 if health is bad or else 1.
181 */
182 int health_check_state(enum health_type type)
183 {
184 int retval = 1;
185 struct health_state *state;
186
187 assert(type < HEALTH_NUM_TYPE);
188
189 state_lock();
190
191 cds_list_for_each_entry(state, &health_state_list.head, node) {
192 int ret;
193
194 if (state->type != type) {
195 continue;
196 }
197
198 ret = validate_state(state);
199 if (!ret) {
200 retval = 0;
201 goto end;
202 }
203 }
204
205 /* Check the global state since some state might not be visible anymore. */
206 if (global_error_state[type] & HEALTH_ERROR) {
207 retval = 0;
208 }
209
210 end:
211 state_unlock();
212
213 DBG("Health check for type %d is %s", (int) type,
214 (retval == 0) ? "BAD" : "GOOD");
215 return retval;
216 }
217
218 /*
219 * Init health state.
220 */
221 void health_register(enum health_type type)
222 {
223 assert(type < HEALTH_NUM_TYPE);
224
225 /* Init TLS state. */
226 uatomic_set(&URCU_TLS(health_state).last, 0);
227 uatomic_set(&URCU_TLS(health_state).last_time.tv_sec, 0);
228 uatomic_set(&URCU_TLS(health_state).last_time.tv_nsec, 0);
229 uatomic_set(&URCU_TLS(health_state).current, 0);
230 uatomic_set(&URCU_TLS(health_state).flags, 0);
231 uatomic_set(&URCU_TLS(health_state).type, type);
232
233 /* Add it to the global TLS state list. */
234 state_lock();
235 cds_list_add(&URCU_TLS(health_state).node, &health_state_list.head);
236 state_unlock();
237 }
238
239 /*
240 * Remove node from global list.
241 */
242 void health_unregister(void)
243 {
244 state_lock();
245 /*
246 * On error, set the global_error_state since we are about to remove
247 * the node from the global list.
248 */
249 if (uatomic_read(&URCU_TLS(health_state).flags) & HEALTH_ERROR) {
250 uatomic_set(&global_error_state[URCU_TLS(health_state).type],
251 HEALTH_ERROR);
252 }
253 cds_list_del(&URCU_TLS(health_state).node);
254 state_unlock();
255 }
This page took 0.036104 seconds and 6 git commands to generate.