From a01c682b9919f98d45a9127b6e02fd08dcc900bc Mon Sep 17 00:00:00 2001 From: Jonathan Rajotte Date: Wed, 16 Jan 2019 13:38:57 -0500 Subject: [PATCH] Fix: run-as thread deadlock on itself in restart error path MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The deadlock was found using this backtrace Thread 5: 0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135 1 0x00007efc6b650023 in __GI___pthread_mutex_lock (mutex=mutex@entry=0x55fc37128400 ) at ../nptl/pthread_mutex_lock.c:78 2 0x000055fc36efbe05 in run_as_destroy_worker () at runas.c:1233 3 0x000055fc36efc2e7 in run_as_restart_worker (worker=) at runas.c:998 4 run_as (cmd=cmd@entry=RUN_AS_UNLINK, data=data@entry=0x7efc5b7fa630, ret_value=ret_value@entry=0x7efc5b7fa510, uid=uid@entry=1000, gid=gid@entry=1000) at runas.c:1033 5 0x000055fc36efc9ce in run_as_unlink (path=path@entry=0x7efc5b7fb690 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index/channel0_3.idx", uid=uid@entry=1000, gid=gid@entry=1000) at runas.c :1120 6 0x000055fc36ef7feb in utils_unlink_stream_file (path_name=path_name@entry=0x7efc5b7fc7e0 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index", file_name=file_name@entry=0x7efc500085d4 "channel0_3", size=size@entry=0, count=count@entry=0, uid=uid@entry=1000, gid=gid@entry=1000, suffix=0x55fc36f19b26 ".idx") at utils.c:929 7 0x000055fc36f01d4e in lttng_index_file_create (path_name=path_name@entry=0x7efc500087a0 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel", stream_name=stream_name@entry=0x7efc500085d4 "channel0_3", uid=1000, gid=1000, size=0, count=0, major=1, minor=1) at index.c:79 8 0x000055fc36ed9475 in rotate_local_stream (ctx=, stream=0x7efc50008460) at consumer.c:4105 9 0x000055fc36ed98b5 in lttng_consumer_rotate_stream (ctx=ctx@entry=0x55fc37428d80, stream=stream@entry=0x7efc50008460, rotated=rotated@entry=0x7efc5b7fdb27) at consumer.c:4181 10 0x000055fc36ee354e in lttng_kconsumer_read_subbuffer (stream=stream@entry=0x7efc50008460, ctx=ctx@entry=0x55fc37428d80, rotated=rotated@entry=0x7efc5b7fdb27) at kernel-consumer.c:1740 11 0x000055fc36ed7a30 in lttng_consumer_read_subbuffer (stream=0x7efc50008460, ctx=0x55fc37428d80) at consumer.c:3383 12 0x000055fc36ed4b74 in consumer_thread_data_poll (data=0x55fc37428d80) at consumer.c:2751 13 0x00007efc6b64d6db in start_thread (arg=0x7efc5b7fe700) at pthread_create.c:463 14 0x00007efc6af6488f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 The owner of the lock is itself: print worker_lock.__data.__owner $2 = 25725 thread find 25725 Thread 5 has target id 'Thread 0x7efc5b7fe700 (LWP 25725)' The worker_lock is first taken in frame #4: run_as runas.c:1033 pthread_mutex_lock(&worker_lock); if (use_clone()) { ... /* * If the worker thread crashed the errno is set to EIO. we log * the error and start a new worker process. */ if (ret == -1 && saved_errno == EIO) { DBG("Socket closed unexpectedly... " "Restarting the worker process"); -> ret = run_as_restart_worker(global_worker); if (ret == -1) { ERR("Failed to restart worker process."); goto err; } Solution ======== Create run_as_restart_worker_no_lock which does not to take the lock on execution. Use run_as_restart_worker_no_lock at the run_as error path call site. Use run_as_restart_worker_no_lock inside run_as_restart_worker while holding the worker lock to provide identical behaviour to other call sites. Signed-off-by: Jonathan Rajotte Signed-off-by: Jérémie Galarneau --- src/common/runas.c | 89 ++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/src/common/runas.c b/src/common/runas.c index 84b3513ee..adbbe3e97 100644 --- a/src/common/runas.c +++ b/src/common/runas.c @@ -994,6 +994,51 @@ error_procname_alloc: return ret; } +static +void run_as_destroy_worker_no_lock(void) +{ + struct run_as_worker *worker = global_worker; + + DBG("Destroying run_as worker"); + if (!worker) { + return; + } + /* Close unix socket */ + DBG("Closing run_as worker socket"); + if (lttcomm_close_unix_sock(worker->sockpair[0])) { + PERROR("close"); + } + worker->sockpair[0] = -1; + /* Wait for worker. */ + for (;;) { + int status; + pid_t wait_ret; + + wait_ret = waitpid(worker->pid, &status, 0); + if (wait_ret < 0) { + if (errno == EINTR) { + continue; + } + PERROR("waitpid"); + break; + } + + if (WIFEXITED(status)) { + LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR, + DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d", + WEXITSTATUS(status)); + break; + } else if (WIFSIGNALED(status)) { + ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal %d", + WTERMSIG(status)); + break; + } + } + free(worker->procname); + free(worker); + global_worker = NULL; +} + static int run_as_restart_worker(struct run_as_worker *worker) { @@ -1003,7 +1048,7 @@ int run_as_restart_worker(struct run_as_worker *worker) procname = worker->procname; /* Close socket to run_as worker process and clean up the zombie process */ - run_as_destroy_worker(); + run_as_destroy_worker_no_lock(); /* Create a new run_as worker process*/ ret = run_as_create_worker_no_lock(procname, NULL, NULL); @@ -1238,47 +1283,7 @@ int run_as_create_worker(const char *procname, LTTNG_HIDDEN void run_as_destroy_worker(void) { - struct run_as_worker *worker = global_worker; - - DBG("Destroying run_as worker"); pthread_mutex_lock(&worker_lock); - if (!worker) { - goto end; - } - /* Close unix socket */ - DBG("Closing run_as worker socket"); - if (lttcomm_close_unix_sock(worker->sockpair[0])) { - PERROR("close"); - } - worker->sockpair[0] = -1; - /* Wait for worker. */ - for (;;) { - int status; - pid_t wait_ret; - - wait_ret = waitpid(worker->pid, &status, 0); - if (wait_ret < 0) { - if (errno == EINTR) { - continue; - } - PERROR("waitpid"); - break; - } - - if (WIFEXITED(status)) { - LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR, - DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d", - WEXITSTATUS(status)); - break; - } else if (WIFSIGNALED(status)) { - ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal %d", - WTERMSIG(status)); - break; - } - } - free(worker->procname); - free(worker); - global_worker = NULL; -end: + run_as_destroy_worker_no_lock(); pthread_mutex_unlock(&worker_lock); } -- 2.34.1