Server process no longer becomes a zombie on Android

Fixes #21.  The problem was caused when the double-fork logic used in "servald
start" was clobbered in 5103176.  This meant that the servald daemon process on
Android no longer had a PPID=1, but the PID of the long-lived
"org.servalproject" parent process which called the JNI entry point.  Killing
the servald process then caused it to become a zombie process, since the
org.servalproject does not habitually call wait(2).  That caused the "servald
stop" logic to send five SIGHUPs to the zombie without any error, making it
appear that the process was not dying.

Reinstated the double-fork logic and added a new test case to ensure that the
daemon process does not become a zombie on being killed prematurely.
This commit is contained in:
Andrew Bettison
2012-10-08 17:13:17 +10:30
parent 4ecb996909
commit 6954325b04
4 changed files with 114 additions and 56 deletions

View File

@ -281,19 +281,27 @@ int cli_printf(const char *fmt, ...)
int cli_delim(const char *opt)
{
#ifdef HAVE_JNI_H
if (jni_env) {
if (jni_env)
return outv_end_field();
} else
#endif
{
const char *delim = getenv("SERVALD_OUTPUT_DELIMITER");
if (delim == NULL)
delim = opt ? opt : "\n";
fputs(delim, stdout);
}
const char *delim = getenv("SERVALD_OUTPUT_DELIMITER");
if (delim == NULL)
delim = opt ? opt : "\n";
fputs(delim, stdout);
return 0;
}
/* Flush the output fields if they are being written to standard output.
*/
void cli_flush()
{
#ifdef HAVE_JNI_H
if (jni_env)
return;
#endif
fflush(stdout);
}
int app_echo(int argc, const char *const *argv, struct command_line_option *o, void *context)
{
if (debug & DEBUG_VERBOSE) DEBUG_argv("command", argc, argv);
@ -473,7 +481,13 @@ int app_server_start(int argc, const char *const *argv, struct command_line_opti
if (debug & DEBUG_VERBOSE) DEBUG_argv("command", argc, argv);
/* Process optional arguments */
int pid=-1;
int cpid=-1;
#if 0
// It would have been nice if whoever disabled this code had left a comment as to why they didn't
// simply delete it altogether. In any event, this logic is largely redundant because the Android
// Batphone app automatically calls "servald stop" then "servald start" (via JNI) whenever its
// monitor interface socket is broken.
// -- Andrew Bettison <andrew@servalproject.com>
int status=server_probe(&pid);
switch(status) {
case SERVER_NOTRESPONDING:
@ -511,21 +525,17 @@ int app_server_start(int argc, const char *const *argv, struct command_line_opti
return -1;
if (instancepath != NULL)
serval_setinstancepath(instancepath);
if (execpath == NULL) {
#ifdef HAVE_JNI_H
if (jni_env)
return WHY("Must supply <exec path> argument when invoked via JNI");
#endif
if ((tmp = malloc(PATH_MAX)) == NULL)
return WHY("Unable to allocate memory for execpath");
return WHY("Out of memory");
if (get_self_executable_path(tmp, PATH_MAX) == -1)
return WHY("unable to determine own executable name");
execpath = tmp;
}
/* Create the instance directory if it does not yet exist */
if (create_serval_instance_dir() == -1)
return -1;
@ -559,44 +569,56 @@ int app_server_start(int argc, const char *const *argv, struct command_line_opti
overlayMode = 1;
if (foregroundP)
return server(NULL);
switch (fork()) {
const char *dir = getenv("SERVALD_SERVER_CHDIR");
if (!dir)
dir = confValueGet("server.chdir", "/");
switch (cpid = fork()) {
case -1:
/* Main process. Fork failed. There is no child process. */
return WHY_perror("fork");
case 0: {
/* Child process. Close logfile (so that it gets re-opened again on demand, with our
own file pointer), disconnect from current directory, disconnect standard I/O streams,
and start a new process group so that if we are being started by an adb shell session,
then we don't receive a SIGHUP when the adb shell process ends. */
close_logging();
int fd;
if ((fd = open("/dev/null", O_RDWR, 0)) == -1)
_exit(WHY_perror("open"));
if (setsid() == -1)
_exit(WHY_perror("setsid"));
const char *dir = getenv("SERVALD_SERVER_CHDIR");
if (!dir)
dir = confValueGet("server.chdir", "/");
(void)chdir(dir);
(void)dup2(fd, 0);
(void)dup2(fd, 1);
(void)dup2(fd, 2);
if (fd > 2)
(void)close(fd);
/* The execpath option is provided so that a JNI call to "start" can be made which
creates a new server daemon process with the correct argv[0]. Otherwise, the servald
process appears as a process with argv[0] = "org.servalproject". */
if (execpath) {
/* XXX: Need the cast on Solaris because it defins NULL as 0L and gcc doesn't
* see it as a sentinal */
execl(execpath, execpath, "start", "foreground", (void *)NULL);
_exit(-1);
/* Child process. Fork then exit, to disconnect daemon from parent process, so that
when daemon exits it does not live on as a zombie. N.B. Do not return from within this
process; that will unroll the JNI call stack and cause havoc. Use _exit(). */
switch (fork()) {
case -1:
exit(WHY_perror("fork"));
case 0: {
/* Grandchild process. Close logfile (so that it gets re-opened again on demand, with
our own file pointer), disconnect from current directory, disconnect standard I/O
streams, and start a new process session so that if we are being started by an adb
shell session, then we don't receive a SIGHUP when the adb shell process ends. */
close_logging();
int fd;
if ((fd = open("/dev/null", O_RDWR, 0)) == -1)
_exit(WHY_perror("open"));
if (setsid() == -1)
_exit(WHY_perror("setsid"));
(void)chdir(dir);
(void)dup2(fd, 0);
(void)dup2(fd, 1);
(void)dup2(fd, 2);
if (fd > 2)
(void)close(fd);
/* The execpath option is provided so that a JNI call to "start" can be made which
creates a new server daemon process with the correct argv[0]. Otherwise, the servald
process appears as a process with argv[0] = "org.servalproject". */
if (execpath) {
/* Need the cast on Solaris because it defines NULL as 0L and gcc doesn't see it as a
sentinal. */
execl(execpath, execpath, "start", "foreground", (void *)NULL);
_exit(-1);
}
_exit(server(NULL));
// NOT REACHED
}
}
_exit(server(NULL));
// NOT REACHED
_exit(0); // Main process is waitpid()-ing for this.
}
}
/* Main process. Allow a few seconds for the child process to report for duty. */
/* Main process. Wait for the child process to fork the grandchild and exit. */
waitpid(cpid, NULL, 0);
/* Allow a few seconds for the grandchild process to report for duty. */
time_ms_t timeout = gettime_ms() + 5000;
do {
sleep_ms(200); // 5 Hz
@ -615,6 +637,19 @@ int app_server_start(int argc, const char *const *argv, struct command_line_opti
cli_delim(":");
cli_printf("%d", pid);
cli_delim("\n");
cli_flush();
/* Sleep before returning if env var is set. This is used in testing, to simulate the situation
on Android phones where the "start" command is invoked via the JNI interface and the calling
process does not die.
*/
const char *post_sleep = getenv("SERVALD_START_POST_SLEEP");
if (post_sleep) {
time_ms_t milliseconds = atof(post_sleep) * 1000;
if (milliseconds > 0) {
INFOF("Sleeping for %lld milliseconds", (long long) milliseconds);
sleep_ms(milliseconds);
}
}
return ret;
}
@ -624,31 +659,25 @@ int app_server_stop(int argc, const char *const *argv, struct command_line_optio
int pid, tries, running;
const char *instancepath;
time_ms_t timeout;
if (cli_arg(argc, argv, o, "instance path", &instancepath, cli_absolute_path, NULL) == -1)
return WHY("Unable to determine instance path");
if (instancepath != NULL)
serval_setinstancepath(instancepath);
instancepath = serval_instancepath();
cli_puts("instancepath");
cli_delim(":");
cli_puts(instancepath);
cli_delim("\n");
pid = server_pid();
/* Not running, nothing to stop */
if (pid <= 0)
return 1;
INFOF("Stopping server (pid=%d)", pid);
/* Set the stop file and signal the process */
cli_puts("pid");
cli_delim(":");
cli_printf("%d", pid);
cli_delim("\n");
tries = 0;
running = pid;
while (running == pid) {
@ -690,15 +719,11 @@ int app_server_status(int argc, const char *const *argv, struct command_line_opt
if (debug & DEBUG_VERBOSE) DEBUG_argv("command", argc, argv);
int pid;
const char *instancepath;
if (cli_arg(argc, argv, o, "instance path", &instancepath, cli_absolute_path, NULL) == -1)
return WHY("Unable to determine instance path");
if (instancepath != NULL)
serval_setinstancepath(instancepath);
pid = server_pid();
cli_puts("instancepath");
cli_delim(":");
cli_puts(serval_instancepath());
@ -713,7 +738,6 @@ int app_server_status(int argc, const char *const *argv, struct command_line_opt
cli_printf("%d", pid);
cli_delim("\n");
}
return pid > 0 ? 0 : 1;
}

View File

@ -413,6 +413,7 @@ int rhizome_direct_process_mime_line(rhizome_http_request *r,char *buffer,int co
filename[1023]=0;
r->field_file=fopen(filename,"w");
if (!r->field_file) {
WHYF_perror("fopen(%s, \"w\")", alloca_str_toprint(filename));
rhizome_direct_clear_temporary_files(r);
rhizome_server_simple_http_response
(r, 500, "<html><h1>Sorry, couldn't complete your request, reasonable as it was. Perhaps try again later.</h1></html>\r\n");

View File

@ -70,6 +70,8 @@ setup_servald() {
cp -f "$servald_build_executable" $servald
unset SERVALD_OUTPUT_DELIMITER
unset SERVALD_SERVER_START_DELAY
unset SERVALD_SERVER_CHDIR
unset SERVALD_START_POST_SLEEP
unset SERVALD_LOG_FILE
servald_instances_dir="$SERVALD_VAR/instance"
set_instance +Z
@ -211,7 +213,7 @@ start_servald_server() {
local -a after_pids
get_servald_pids before_pids
tfw_log "# before_pids=$before_pids"
SERVALD_SERVER_CHDIR="$instance_dir" SERVALD_LOG_FILE="$instance_servald_log" executeOk --core-backtrace $servald start "$@"
executeOk --core-backtrace servald_start "$@"
extract_stdout_keyvalue start_instance_path instancepath '.*'
extract_stdout_keyvalue start_pid pid '[0-9]\+'
assert [ "$start_instance_path" = "$SERVALINSTANCE_PATH" ]
@ -250,6 +252,12 @@ start_servald_server() {
pop_instance
}
# Utility function:
# - invoke "servald start" command with given args and suitable environment
servald_start() {
SERVALD_SERVER_CHDIR="$instance_dir" SERVALD_LOG_FILE="$instance_servald_log" $servald start "$@"
}
# Utility function:
# - stop a servald server process instance in an orderly fashion
# - cat its log file into the test log
@ -266,6 +274,7 @@ stop_servald_server() {
extract_stdout_keyvalue stop_instance_path instancepath '.*'
assert [ "$stop_instance_path" = "$SERVALINSTANCE_PATH" ]
if [ -n "$servald_pid" ]; then
assertExitStatus '==' 0
extract_stdout_keyvalue stop_pid pid '[0-9]\+'
assert [ "$stop_pid" = "$servald_pid" ]
fi

View File

@ -109,4 +109,28 @@ test_StartStopFast() {
stop_servald_server
}
doc_NoZombie="Server process does not become a zombie"
setup_NoZombie() {
setup
setup_interfaces
export SERVALD_START_POST_SLEEP=1000
servald_start &
start_pid=$!
wait_until get_servald_server_pidfile servald_pid
assert kill -0 $start_pid
}
test_NoZombie() {
tfw_log "Before kill -KILL $servald_pid"
ps -l $start_pid $servald_pid
assert kill -KILL $servald_pid
tfw_log "After kill -KILL $servald_pid"
ps -l $start_pid $servald_pid
assert --message="zombie servald process does not exist" ! kill -0 $servald_pid
}
teardown_NoZombie() {
kill -TERM $start_pid
kill_all_servald_processes
assert_no_servald_processes
}
runTests "$@"