From 53935dc36121b10975e047438ae8135ffa702920 Mon Sep 17 00:00:00 2001 From: zhongtao Date: Tue, 4 Apr 2023 16:30:54 +0800 Subject: [PATCH 37/46] clean container process after execSync timeout exit Signed-off-by: zhongtao --- src/cmd/isulad-shim/common.c | 30 ++++ src/cmd/isulad-shim/common.h | 11 ++ src/cmd/isulad-shim/main.c | 15 +- src/cmd/isulad-shim/process.c | 158 +++++++++++++----- src/cmd/isulad-shim/process.h | 2 +- .../modules/runtime/isula/isula_rt_ops.c | 40 ++--- 6 files changed, 188 insertions(+), 68 deletions(-) diff --git a/src/cmd/isulad-shim/common.c b/src/cmd/isulad-shim/common.c index 0c345187..f188da1e 100644 --- a/src/cmd/isulad-shim/common.c +++ b/src/cmd/isulad-shim/common.c @@ -26,6 +26,7 @@ #include #include #include +#include int set_fd_no_inherited(int fd) { @@ -316,3 +317,32 @@ int open_no_inherit(const char *path, int flag, mode_t mode) return fd; } + +static bool is_invalid_error_str(const char *err_str, const char *numstr) +{ + return err_str == NULL || err_str == numstr || *err_str != '\0'; +} + +int shim_util_safe_uint64(const char *numstr, uint64_t *converted) +{ + char *err_str = NULL; + uint64_t ull; + + if (numstr == NULL || converted == NULL) { + return -EINVAL; + } + + errno = 0; + ull = strtoull(numstr, &err_str, 0); + if (errno > 0) { + return -errno; + } + + if (is_invalid_error_str(err_str, numstr)) { + return -EINVAL; + } + + *converted = (uint64_t)ull; + return 0; +} + diff --git a/src/cmd/isulad-shim/common.h b/src/cmd/isulad-shim/common.h index d06c5256..91808295 100644 --- a/src/cmd/isulad-shim/common.h +++ b/src/cmd/isulad-shim/common.h @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -31,6 +32,14 @@ extern "C" { #define SHIM_ERR (-1) #define SHIM_ERR_WAIT (-2) #define SHIM_ERR_NOT_REQUIRED (-3) +#define SHIM_ERR_TIMEOUT (-4) + +// common exit code is defined in stdlib.h +// EXIT_FAILURE 1 : Failing exit status. +// EXIT_SUCCESS 0 : Successful exit status. +// custom shim exit code +// SHIM_EXIT_TIMEOUT 2: Container process timeout exit code +#define SHIM_EXIT_TIMEOUT 2 #define INFO_MSG "info" #define WARN_MSG "warn" @@ -68,6 +77,8 @@ void close_fd(int *pfd); int open_no_inherit(const char *path, int flag, mode_t mode); +int shim_util_safe_uint64(const char *numstr, uint64_t *converted); + #ifdef __cplusplus } #endif diff --git a/src/cmd/isulad-shim/main.c b/src/cmd/isulad-shim/main.c index eedd8fda..68e99e53 100644 --- a/src/cmd/isulad-shim/main.c +++ b/src/cmd/isulad-shim/main.c @@ -62,7 +62,8 @@ static int set_subreaper() return SHIM_OK; } -static int parse_args(int argc, char **argv, char **cid, char **bundle, char **rt_name, char **log_level) +static int parse_args(int argc, char **argv, char **cid, char **bundle, char **rt_name, char **log_level, + uint64_t *timeout) { if (argc < 4) { return SHIM_ERR; @@ -82,6 +83,12 @@ static int parse_args(int argc, char **argv, char **cid, char **bundle, char **r } } + if (argc > 5) { + if (shim_util_safe_uint64(strdup(argv[5]), timeout) != 0) { + return SHIM_ERR; + } + } + return SHIM_OK; } @@ -99,6 +106,8 @@ int main(int argc, char **argv) int efd = -1; process_t *p = NULL; pthread_t tid_accept; + // execSync timeout + uint64_t timeout = 0; g_log_fd = open_no_inherit(SHIM_LOG_NAME, O_CREAT | O_WRONLY | O_APPEND | O_SYNC, 0640); if (g_log_fd < 0) { @@ -117,7 +126,7 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } - ret = parse_args(argc, argv, &container_id, &bundle, &rt_name, &log_level); + ret = parse_args(argc, argv, &container_id, &bundle, &rt_name, &log_level, &timeout); if (ret != SHIM_OK) { write_message(g_log_fd, ERR_MSG, "parse args failed:%d", ret); exit(EXIT_FAILURE); @@ -167,5 +176,5 @@ int main(int argc, char **argv) released_timeout_exit(); - return process_signal_handle_routine(p, tid_accept); + return process_signal_handle_routine(p, tid_accept, timeout); } diff --git a/src/cmd/isulad-shim/process.c b/src/cmd/isulad-shim/process.c index 5222629c..02609911 100644 --- a/src/cmd/isulad-shim/process.c +++ b/src/cmd/isulad-shim/process.c @@ -1213,69 +1213,145 @@ static int try_wait_all_child(void) return 1; } -int process_signal_handle_routine(process_t *p, const pthread_t tid_accept) +static int waitpid_with_timeout(int ctr_pid, int *status, const int64_t timeout) { - int ret = SHIM_ERR; - bool exit_shim = false; int nret = 0; - int i; - struct timespec ts; + time_t start_time = time(NULL); + time_t end_time; + double interval; + int st; for (;;) { - int status; - ret = reap_container(p->ctr_pid, &status); + nret = waitpid(-1, &st, WNOHANG); + if (nret == ctr_pid) { + break; + } + end_time = time(NULL); + interval = difftime(end_time, start_time); + if (nret == 0 && interval >= timeout) { + return SHIM_ERR_TIMEOUT; + } + // sleep some time instead to avoid cpu full running and then retry. + usleep(1000); + } + + if (WIFSIGNALED(st)) { + *status = EXIT_SIGNAL_OFFSET + WTERMSIG(st); + } else { + *status = WEXITSTATUS(st); + } + + if (*status == CONTAINER_ACTION_REBOOT) { + nret = setenv("CONTAINER_ACTION", "reboot", 1); + if (nret != SHIM_OK) { + write_message(g_log_fd, WARN_MSG, "set reboot action failed:%d", SHIM_SYS_ERR(errno)); + } + } else if (*status == CONTAINER_ACTION_SHUTDOWN) { + nret = setenv("CONTAINER_ACTION", "shutdown", 1); + if (nret != SHIM_OK) { + write_message(g_log_fd, WARN_MSG, "set shutdown action failed:%d", SHIM_SYS_ERR(errno)); + } + } + return SHIM_OK; +} + +/* + * If timeout <= 0, blocking wait in reap_container. + * If timeout > 0, non-blocking wait pid with timeout. + */ +static int wait_container_process_with_timeout(process_t *p, const unsigned int timeout, int *status) +{ + int ret = SHIM_ERR; + + if (timeout > 0) { + return waitpid_with_timeout(p->ctr_pid, status, timeout); + } + + for (;;) { + ret = reap_container(p->ctr_pid, status); if (ret == SHIM_OK) { - exit_shim = true; - if (status == CONTAINER_ACTION_REBOOT) { + if (*status == CONTAINER_ACTION_REBOOT) { ret = setenv("CONTAINER_ACTION", "reboot", 1); if (ret != SHIM_OK) { write_message(g_log_fd, WARN_MSG, "set reboot action failed:%d", SHIM_SYS_ERR(errno)); } - } else if (status == CONTAINER_ACTION_SHUTDOWN) { + } else if (*status == CONTAINER_ACTION_SHUTDOWN) { ret = setenv("CONTAINER_ACTION", "shutdown", 1); if (ret != SHIM_OK) { write_message(g_log_fd, WARN_MSG, "set shutdown action failed:%d", SHIM_SYS_ERR(errno)); } } - } else if (ret == SHIM_ERR_WAIT) { + return SHIM_OK; + } + + if (ret == SHIM_ERR_WAIT) { /* avoid thread entering the infinite loop */ usleep(1000); + } + + if (ret == SHIM_ERR) { + // if the child process is not expected, retry. continue; } - if (exit_shim) { - process_kill_all(p); + } - // wait atmost 120 seconds - DO_RETRY_CALL(120, 1000000, nret, try_wait_all_child); - if (nret != 0) { - write_message(g_log_fd, ERR_MSG, "Failed to wait all child after 120 seconds"); - } +} - process_delete(p); - if (p->exit_fd > 0) { - (void)write_nointr(p->exit_fd, &status, sizeof(int)); - } - // wait for task_console_accept thread termination. In order to make sure that - // the io_copy connection is established and io_thread is not used by multiple threads. - if (p->state->terminal) { - if (clock_gettime(CLOCK_REALTIME, &ts) == -1) { - write_message(g_log_fd, ERR_MSG, "Failed to get realtime"); - nret = pthread_join(tid_accept, NULL); - } else { - // Set the maximum waiting time to 60s to prevent stuck. - ts.tv_sec += 60; - nret = pthread_timedjoin_np(tid_accept, NULL, &ts); - } +int process_signal_handle_routine(process_t *p, const pthread_t tid_accept, const unsigned int timeout) +{ + int i; + int nret = 0; + int ret = 0; + int status = 0; + struct timespec ts; - if (nret != 0) { - write_message(g_log_fd, ERR_MSG, "Failed to join task_console_accept thread"); - } - } + ret = wait_container_process_with_timeout(p, timeout, &status); + if (ret == SHIM_ERR_TIMEOUT) { + // kill container process to ensure process_kill_all effective + nret = kill(p->ctr_pid, SIGKILL); + if (nret < 0 && errno != ESRCH) { + write_message(g_log_fd, ERR_MSG, "Can not kill process (pid=%d) with SIGKILL", p->ctr_pid); + exit(EXIT_FAILURE); + } + } - for (i = 0; i < 3; i++) { - destroy_io_thread(p, i); - } - return status; + process_kill_all(p); + + // wait atmost 120 seconds + DO_RETRY_CALL(120, 1000000, nret, try_wait_all_child); + if (nret != 0) { + write_message(g_log_fd, ERR_MSG, "Failed to wait all child after 120 seconds"); + } + + process_delete(p); + if (p->exit_fd > 0) { + (void)write_nointr(p->exit_fd, &status, sizeof(int)); + } + // wait for task_console_accept thread termination. In order to make sure that + // the io_copy connection is established and io_thread is not used by multiple threads. + if (p->state->terminal) { + if (clock_gettime(CLOCK_REALTIME, &ts) == -1) { + write_message(g_log_fd, ERR_MSG, "Failed to get realtime"); + nret = pthread_join(tid_accept, NULL); + } else { + // Set the maximum waiting time to 60s to prevent stuck. + ts.tv_sec += 60; + nret = pthread_timedjoin_np(tid_accept, NULL, &ts); } + + if (nret != 0) { + write_message(g_log_fd, ERR_MSG, "Failed to join task_console_accept thread"); + } + } + + for (i = 0; i < 3; i++) { + destroy_io_thread(p, i); } + + if (ret == SHIM_ERR_TIMEOUT) { + write_message(g_log_fd, INFO_MSG, "Wait %d timeout", p->ctr_pid); + exit(SHIM_EXIT_TIMEOUT); + } + return status; + } diff --git a/src/cmd/isulad-shim/process.h b/src/cmd/isulad-shim/process.h index 66820f68..7e3259e8 100644 --- a/src/cmd/isulad-shim/process.h +++ b/src/cmd/isulad-shim/process.h @@ -97,7 +97,7 @@ process_t* new_process(char *id, char *bundle, char *runtime); int open_io(process_t *p, pthread_t *tid_accept); int process_io_init(process_t *p); int create_process(process_t *p); -int process_signal_handle_routine(process_t *p, const pthread_t tid_accept); +int process_signal_handle_routine(process_t *p, const pthread_t tid_accept, const unsigned int timeout); #ifdef __cplusplus } diff --git a/src/daemon/modules/runtime/isula/isula_rt_ops.c b/src/daemon/modules/runtime/isula/isula_rt_ops.c index e974964a..5a01b8c6 100644 --- a/src/daemon/modules/runtime/isula/isula_rt_ops.c +++ b/src/daemon/modules/runtime/isula/isula_rt_ops.c @@ -54,6 +54,7 @@ #define SHIM_LOG_SIZE ((BUFSIZ - 100) / 2) #define RESIZE_DATA_SIZE 100 #define PID_WAIT_TIME 120 +#define SHIM_EXIT_TIMEOUT 2 // file name formats of cgroup resources json #define RESOURCE_FNAME_FORMATS "%s/resources.json" @@ -692,27 +693,6 @@ static int status_to_exit_code(int status) return exit_code; } -static int try_wait_pid(pid_t pid) -{ - if (waitpid(pid, NULL, WNOHANG) == pid) { - return 0; - } - - return 1; -} - -static void kill_and_show_err(pid_t pid) -{ - int nret = 0; - kill(pid, SIGKILL); - // wait atmost 0.5 seconds - DO_RETRY_CALL(5, 100000, nret, try_wait_pid, pid); - if (nret != 0) { - WARN("Fail to wait isulad-shim"); - } - isulad_set_error_message("Exec container error;exec timeout"); -} - static int shim_create(bool fg, const char *id, const char *workdir, const char *bundle, const char *runtime_cmd, int *exit_code, const int64_t timeout) { @@ -731,7 +711,14 @@ static int shim_create(bool fg, const char *id, const char *workdir, const char params[i++] = bundle; params[i++] = runtime_cmd; params[i++] = "info"; - params[i++] = "2m0s"; + // execSync timeout + if (timeout > 0) { + params[i] = util_int_to_string(timeout); + if (params[i] == NULL) { + ERROR("Failed to convert execSync timeout %ld to string", timeout); + return -1; + } + } runtime_exec_param_dump(params); if (snprintf(fpid, sizeof(fpid), "%s/shim-pid", workdir) < 0) { @@ -805,7 +792,7 @@ realexec: goto out; } - status = util_waitpid_with_timeout(pid, timeout, kill_and_show_err); + status = util_wait_for_pid_status(pid); if (status < 0) { ERROR("failed wait shim-parent %d exit %s", pid, strerror(errno)); ret = -1; @@ -1204,6 +1191,13 @@ int rt_isula_exec(const char *id, const char *runtime, const rt_exec_params_t *p goto errlog_out; } + if (*exit_code == SHIM_EXIT_TIMEOUT) { + ret = -1; + isulad_set_error_message("Exec container error;exec timeout"); + ERROR("isulad-shim %d exit for execing timeout", pid); + goto errlog_out; + } + pid = get_container_process_pid(workdir); if (pid < 0) { ERROR("%s: failed get exec process id", workdir); -- 2.25.1