iSulad/0037-clean-container-process-after-execSync-timeout-exit.patch
zhangxiaoyu 5ec852595b bugfix for runc and cri
Signed-off-by: zhangxiaoyu <zhangxiaoyu58@huawei.com>
(cherry picked from commit 9c3acba9915c23718ae8a806daa49022a73756eb)
2023-04-25 14:57:24 +08:00

435 lines
14 KiB
Diff

From 53935dc36121b10975e047438ae8135ffa702920 Mon Sep 17 00:00:00 2001
From: zhongtao <zhongtao17@huawei.com>
Date: Tue, 4 Apr 2023 16:30:54 +0800
Subject: [PATCH 37/46] clean container process after execSync timeout exit
Signed-off-by: zhongtao <zhongtao17@huawei.com>
---
src/cmd/isulad-shim/common.c | 30 ++++
src/cmd/isulad-shim/common.h | 11 ++
src/cmd/isulad-shim/main.c | 15 +-
src/cmd/isulad-shim/process.c | 158 +++++++++++++-----
src/cmd/isulad-shim/process.h | 2 +-
.../modules/runtime/isula/isula_rt_ops.c | 40 ++---
6 files changed, 188 insertions(+), 68 deletions(-)
diff --git a/src/cmd/isulad-shim/common.c b/src/cmd/isulad-shim/common.c
index 0c345187..f188da1e 100644
--- a/src/cmd/isulad-shim/common.c
+++ b/src/cmd/isulad-shim/common.c
@@ -26,6 +26,7 @@
#include <sys/stat.h>
#include <stdbool.h>
#include <stdarg.h>
+#include <limits.h>
int set_fd_no_inherited(int fd)
{
@@ -316,3 +317,32 @@ int open_no_inherit(const char *path, int flag, mode_t mode)
return fd;
}
+
+static bool is_invalid_error_str(const char *err_str, const char *numstr)
+{
+ return err_str == NULL || err_str == numstr || *err_str != '\0';
+}
+
+int shim_util_safe_uint64(const char *numstr, uint64_t *converted)
+{
+ char *err_str = NULL;
+ uint64_t ull;
+
+ if (numstr == NULL || converted == NULL) {
+ return -EINVAL;
+ }
+
+ errno = 0;
+ ull = strtoull(numstr, &err_str, 0);
+ if (errno > 0) {
+ return -errno;
+ }
+
+ if (is_invalid_error_str(err_str, numstr)) {
+ return -EINVAL;
+ }
+
+ *converted = (uint64_t)ull;
+ return 0;
+}
+
diff --git a/src/cmd/isulad-shim/common.h b/src/cmd/isulad-shim/common.h
index d06c5256..91808295 100644
--- a/src/cmd/isulad-shim/common.h
+++ b/src/cmd/isulad-shim/common.h
@@ -19,6 +19,7 @@
#include <stdbool.h>
#include <stddef.h>
#include <sys/types.h>
+#include <stdint.h>
#ifdef __cplusplus
extern "C" {
@@ -31,6 +32,14 @@ extern "C" {
#define SHIM_ERR (-1)
#define SHIM_ERR_WAIT (-2)
#define SHIM_ERR_NOT_REQUIRED (-3)
+#define SHIM_ERR_TIMEOUT (-4)
+
+// common exit code is defined in stdlib.h
+// EXIT_FAILURE 1 : Failing exit status.
+// EXIT_SUCCESS 0 : Successful exit status.
+// custom shim exit code
+// SHIM_EXIT_TIMEOUT 2: Container process timeout exit code
+#define SHIM_EXIT_TIMEOUT 2
#define INFO_MSG "info"
#define WARN_MSG "warn"
@@ -68,6 +77,8 @@ void close_fd(int *pfd);
int open_no_inherit(const char *path, int flag, mode_t mode);
+int shim_util_safe_uint64(const char *numstr, uint64_t *converted);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/cmd/isulad-shim/main.c b/src/cmd/isulad-shim/main.c
index eedd8fda..68e99e53 100644
--- a/src/cmd/isulad-shim/main.c
+++ b/src/cmd/isulad-shim/main.c
@@ -62,7 +62,8 @@ static int set_subreaper()
return SHIM_OK;
}
-static int parse_args(int argc, char **argv, char **cid, char **bundle, char **rt_name, char **log_level)
+static int parse_args(int argc, char **argv, char **cid, char **bundle, char **rt_name, char **log_level,
+ uint64_t *timeout)
{
if (argc < 4) {
return SHIM_ERR;
@@ -82,6 +83,12 @@ static int parse_args(int argc, char **argv, char **cid, char **bundle, char **r
}
}
+ if (argc > 5) {
+ if (shim_util_safe_uint64(strdup(argv[5]), timeout) != 0) {
+ return SHIM_ERR;
+ }
+ }
+
return SHIM_OK;
}
@@ -99,6 +106,8 @@ int main(int argc, char **argv)
int efd = -1;
process_t *p = NULL;
pthread_t tid_accept;
+ // execSync timeout
+ uint64_t timeout = 0;
g_log_fd = open_no_inherit(SHIM_LOG_NAME, O_CREAT | O_WRONLY | O_APPEND | O_SYNC, 0640);
if (g_log_fd < 0) {
@@ -117,7 +126,7 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
- ret = parse_args(argc, argv, &container_id, &bundle, &rt_name, &log_level);
+ ret = parse_args(argc, argv, &container_id, &bundle, &rt_name, &log_level, &timeout);
if (ret != SHIM_OK) {
write_message(g_log_fd, ERR_MSG, "parse args failed:%d", ret);
exit(EXIT_FAILURE);
@@ -167,5 +176,5 @@ int main(int argc, char **argv)
released_timeout_exit();
- return process_signal_handle_routine(p, tid_accept);
+ return process_signal_handle_routine(p, tid_accept, timeout);
}
diff --git a/src/cmd/isulad-shim/process.c b/src/cmd/isulad-shim/process.c
index 5222629c..02609911 100644
--- a/src/cmd/isulad-shim/process.c
+++ b/src/cmd/isulad-shim/process.c
@@ -1213,69 +1213,145 @@ static int try_wait_all_child(void)
return 1;
}
-int process_signal_handle_routine(process_t *p, const pthread_t tid_accept)
+static int waitpid_with_timeout(int ctr_pid, int *status, const int64_t timeout)
{
- int ret = SHIM_ERR;
- bool exit_shim = false;
int nret = 0;
- int i;
- struct timespec ts;
+ time_t start_time = time(NULL);
+ time_t end_time;
+ double interval;
+ int st;
for (;;) {
- int status;
- ret = reap_container(p->ctr_pid, &status);
+ nret = waitpid(-1, &st, WNOHANG);
+ if (nret == ctr_pid) {
+ break;
+ }
+ end_time = time(NULL);
+ interval = difftime(end_time, start_time);
+ if (nret == 0 && interval >= timeout) {
+ return SHIM_ERR_TIMEOUT;
+ }
+ // sleep some time instead to avoid cpu full running and then retry.
+ usleep(1000);
+ }
+
+ if (WIFSIGNALED(st)) {
+ *status = EXIT_SIGNAL_OFFSET + WTERMSIG(st);
+ } else {
+ *status = WEXITSTATUS(st);
+ }
+
+ if (*status == CONTAINER_ACTION_REBOOT) {
+ nret = setenv("CONTAINER_ACTION", "reboot", 1);
+ if (nret != SHIM_OK) {
+ write_message(g_log_fd, WARN_MSG, "set reboot action failed:%d", SHIM_SYS_ERR(errno));
+ }
+ } else if (*status == CONTAINER_ACTION_SHUTDOWN) {
+ nret = setenv("CONTAINER_ACTION", "shutdown", 1);
+ if (nret != SHIM_OK) {
+ write_message(g_log_fd, WARN_MSG, "set shutdown action failed:%d", SHIM_SYS_ERR(errno));
+ }
+ }
+ return SHIM_OK;
+}
+
+/*
+ * If timeout <= 0, blocking wait in reap_container.
+ * If timeout > 0, non-blocking wait pid with timeout.
+ */
+static int wait_container_process_with_timeout(process_t *p, const unsigned int timeout, int *status)
+{
+ int ret = SHIM_ERR;
+
+ if (timeout > 0) {
+ return waitpid_with_timeout(p->ctr_pid, status, timeout);
+ }
+
+ for (;;) {
+ ret = reap_container(p->ctr_pid, status);
if (ret == SHIM_OK) {
- exit_shim = true;
- if (status == CONTAINER_ACTION_REBOOT) {
+ if (*status == CONTAINER_ACTION_REBOOT) {
ret = setenv("CONTAINER_ACTION", "reboot", 1);
if (ret != SHIM_OK) {
write_message(g_log_fd, WARN_MSG, "set reboot action failed:%d", SHIM_SYS_ERR(errno));
}
- } else if (status == CONTAINER_ACTION_SHUTDOWN) {
+ } else if (*status == CONTAINER_ACTION_SHUTDOWN) {
ret = setenv("CONTAINER_ACTION", "shutdown", 1);
if (ret != SHIM_OK) {
write_message(g_log_fd, WARN_MSG, "set shutdown action failed:%d", SHIM_SYS_ERR(errno));
}
}
- } else if (ret == SHIM_ERR_WAIT) {
+ return SHIM_OK;
+ }
+
+ if (ret == SHIM_ERR_WAIT) {
/* avoid thread entering the infinite loop */
usleep(1000);
+ }
+
+ if (ret == SHIM_ERR) {
+ // if the child process is not expected, retry.
continue;
}
- if (exit_shim) {
- process_kill_all(p);
+ }
- // wait atmost 120 seconds
- DO_RETRY_CALL(120, 1000000, nret, try_wait_all_child);
- if (nret != 0) {
- write_message(g_log_fd, ERR_MSG, "Failed to wait all child after 120 seconds");
- }
+}
- process_delete(p);
- if (p->exit_fd > 0) {
- (void)write_nointr(p->exit_fd, &status, sizeof(int));
- }
- // wait for task_console_accept thread termination. In order to make sure that
- // the io_copy connection is established and io_thread is not used by multiple threads.
- if (p->state->terminal) {
- if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
- write_message(g_log_fd, ERR_MSG, "Failed to get realtime");
- nret = pthread_join(tid_accept, NULL);
- } else {
- // Set the maximum waiting time to 60s to prevent stuck.
- ts.tv_sec += 60;
- nret = pthread_timedjoin_np(tid_accept, NULL, &ts);
- }
+int process_signal_handle_routine(process_t *p, const pthread_t tid_accept, const unsigned int timeout)
+{
+ int i;
+ int nret = 0;
+ int ret = 0;
+ int status = 0;
+ struct timespec ts;
- if (nret != 0) {
- write_message(g_log_fd, ERR_MSG, "Failed to join task_console_accept thread");
- }
- }
+ ret = wait_container_process_with_timeout(p, timeout, &status);
+ if (ret == SHIM_ERR_TIMEOUT) {
+ // kill container process to ensure process_kill_all effective
+ nret = kill(p->ctr_pid, SIGKILL);
+ if (nret < 0 && errno != ESRCH) {
+ write_message(g_log_fd, ERR_MSG, "Can not kill process (pid=%d) with SIGKILL", p->ctr_pid);
+ exit(EXIT_FAILURE);
+ }
+ }
- for (i = 0; i < 3; i++) {
- destroy_io_thread(p, i);
- }
- return status;
+ process_kill_all(p);
+
+ // wait atmost 120 seconds
+ DO_RETRY_CALL(120, 1000000, nret, try_wait_all_child);
+ if (nret != 0) {
+ write_message(g_log_fd, ERR_MSG, "Failed to wait all child after 120 seconds");
+ }
+
+ process_delete(p);
+ if (p->exit_fd > 0) {
+ (void)write_nointr(p->exit_fd, &status, sizeof(int));
+ }
+ // wait for task_console_accept thread termination. In order to make sure that
+ // the io_copy connection is established and io_thread is not used by multiple threads.
+ if (p->state->terminal) {
+ if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
+ write_message(g_log_fd, ERR_MSG, "Failed to get realtime");
+ nret = pthread_join(tid_accept, NULL);
+ } else {
+ // Set the maximum waiting time to 60s to prevent stuck.
+ ts.tv_sec += 60;
+ nret = pthread_timedjoin_np(tid_accept, NULL, &ts);
}
+
+ if (nret != 0) {
+ write_message(g_log_fd, ERR_MSG, "Failed to join task_console_accept thread");
+ }
+ }
+
+ for (i = 0; i < 3; i++) {
+ destroy_io_thread(p, i);
}
+
+ if (ret == SHIM_ERR_TIMEOUT) {
+ write_message(g_log_fd, INFO_MSG, "Wait %d timeout", p->ctr_pid);
+ exit(SHIM_EXIT_TIMEOUT);
+ }
+ return status;
+
}
diff --git a/src/cmd/isulad-shim/process.h b/src/cmd/isulad-shim/process.h
index 66820f68..7e3259e8 100644
--- a/src/cmd/isulad-shim/process.h
+++ b/src/cmd/isulad-shim/process.h
@@ -97,7 +97,7 @@ process_t* new_process(char *id, char *bundle, char *runtime);
int open_io(process_t *p, pthread_t *tid_accept);
int process_io_init(process_t *p);
int create_process(process_t *p);
-int process_signal_handle_routine(process_t *p, const pthread_t tid_accept);
+int process_signal_handle_routine(process_t *p, const pthread_t tid_accept, const unsigned int timeout);
#ifdef __cplusplus
}
diff --git a/src/daemon/modules/runtime/isula/isula_rt_ops.c b/src/daemon/modules/runtime/isula/isula_rt_ops.c
index e974964a..5a01b8c6 100644
--- a/src/daemon/modules/runtime/isula/isula_rt_ops.c
+++ b/src/daemon/modules/runtime/isula/isula_rt_ops.c
@@ -54,6 +54,7 @@
#define SHIM_LOG_SIZE ((BUFSIZ - 100) / 2)
#define RESIZE_DATA_SIZE 100
#define PID_WAIT_TIME 120
+#define SHIM_EXIT_TIMEOUT 2
// file name formats of cgroup resources json
#define RESOURCE_FNAME_FORMATS "%s/resources.json"
@@ -692,27 +693,6 @@ static int status_to_exit_code(int status)
return exit_code;
}
-static int try_wait_pid(pid_t pid)
-{
- if (waitpid(pid, NULL, WNOHANG) == pid) {
- return 0;
- }
-
- return 1;
-}
-
-static void kill_and_show_err(pid_t pid)
-{
- int nret = 0;
- kill(pid, SIGKILL);
- // wait atmost 0.5 seconds
- DO_RETRY_CALL(5, 100000, nret, try_wait_pid, pid);
- if (nret != 0) {
- WARN("Fail to wait isulad-shim");
- }
- isulad_set_error_message("Exec container error;exec timeout");
-}
-
static int shim_create(bool fg, const char *id, const char *workdir, const char *bundle, const char *runtime_cmd,
int *exit_code, const int64_t timeout)
{
@@ -731,7 +711,14 @@ static int shim_create(bool fg, const char *id, const char *workdir, const char
params[i++] = bundle;
params[i++] = runtime_cmd;
params[i++] = "info";
- params[i++] = "2m0s";
+ // execSync timeout
+ if (timeout > 0) {
+ params[i] = util_int_to_string(timeout);
+ if (params[i] == NULL) {
+ ERROR("Failed to convert execSync timeout %ld to string", timeout);
+ return -1;
+ }
+ }
runtime_exec_param_dump(params);
if (snprintf(fpid, sizeof(fpid), "%s/shim-pid", workdir) < 0) {
@@ -805,7 +792,7 @@ realexec:
goto out;
}
- status = util_waitpid_with_timeout(pid, timeout, kill_and_show_err);
+ status = util_wait_for_pid_status(pid);
if (status < 0) {
ERROR("failed wait shim-parent %d exit %s", pid, strerror(errno));
ret = -1;
@@ -1204,6 +1191,13 @@ int rt_isula_exec(const char *id, const char *runtime, const rt_exec_params_t *p
goto errlog_out;
}
+ if (*exit_code == SHIM_EXIT_TIMEOUT) {
+ ret = -1;
+ isulad_set_error_message("Exec container error;exec timeout");
+ ERROR("isulad-shim %d exit for execing timeout", pid);
+ goto errlog_out;
+ }
+
pid = get_container_process_pid(workdir);
if (pid < 0) {
ERROR("%s: failed get exec process id", workdir);
--
2.25.1