Signed-off-by: zhangxiaoyu <zhangxiaoyu58@huawei.com> (cherry picked from commit 9c3acba9915c23718ae8a806daa49022a73756eb)
435 lines
14 KiB
Diff
435 lines
14 KiB
Diff
From 53935dc36121b10975e047438ae8135ffa702920 Mon Sep 17 00:00:00 2001
|
|
From: zhongtao <zhongtao17@huawei.com>
|
|
Date: Tue, 4 Apr 2023 16:30:54 +0800
|
|
Subject: [PATCH 37/46] clean container process after execSync timeout exit
|
|
|
|
Signed-off-by: zhongtao <zhongtao17@huawei.com>
|
|
---
|
|
src/cmd/isulad-shim/common.c | 30 ++++
|
|
src/cmd/isulad-shim/common.h | 11 ++
|
|
src/cmd/isulad-shim/main.c | 15 +-
|
|
src/cmd/isulad-shim/process.c | 158 +++++++++++++-----
|
|
src/cmd/isulad-shim/process.h | 2 +-
|
|
.../modules/runtime/isula/isula_rt_ops.c | 40 ++---
|
|
6 files changed, 188 insertions(+), 68 deletions(-)
|
|
|
|
diff --git a/src/cmd/isulad-shim/common.c b/src/cmd/isulad-shim/common.c
|
|
index 0c345187..f188da1e 100644
|
|
--- a/src/cmd/isulad-shim/common.c
|
|
+++ b/src/cmd/isulad-shim/common.c
|
|
@@ -26,6 +26,7 @@
|
|
#include <sys/stat.h>
|
|
#include <stdbool.h>
|
|
#include <stdarg.h>
|
|
+#include <limits.h>
|
|
|
|
int set_fd_no_inherited(int fd)
|
|
{
|
|
@@ -316,3 +317,32 @@ int open_no_inherit(const char *path, int flag, mode_t mode)
|
|
|
|
return fd;
|
|
}
|
|
+
|
|
+static bool is_invalid_error_str(const char *err_str, const char *numstr)
|
|
+{
|
|
+ return err_str == NULL || err_str == numstr || *err_str != '\0';
|
|
+}
|
|
+
|
|
+int shim_util_safe_uint64(const char *numstr, uint64_t *converted)
|
|
+{
|
|
+ char *err_str = NULL;
|
|
+ uint64_t ull;
|
|
+
|
|
+ if (numstr == NULL || converted == NULL) {
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ errno = 0;
|
|
+ ull = strtoull(numstr, &err_str, 0);
|
|
+ if (errno > 0) {
|
|
+ return -errno;
|
|
+ }
|
|
+
|
|
+ if (is_invalid_error_str(err_str, numstr)) {
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ *converted = (uint64_t)ull;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
diff --git a/src/cmd/isulad-shim/common.h b/src/cmd/isulad-shim/common.h
|
|
index d06c5256..91808295 100644
|
|
--- a/src/cmd/isulad-shim/common.h
|
|
+++ b/src/cmd/isulad-shim/common.h
|
|
@@ -19,6 +19,7 @@
|
|
#include <stdbool.h>
|
|
#include <stddef.h>
|
|
#include <sys/types.h>
|
|
+#include <stdint.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
@@ -31,6 +32,14 @@ extern "C" {
|
|
#define SHIM_ERR (-1)
|
|
#define SHIM_ERR_WAIT (-2)
|
|
#define SHIM_ERR_NOT_REQUIRED (-3)
|
|
+#define SHIM_ERR_TIMEOUT (-4)
|
|
+
|
|
+// common exit code is defined in stdlib.h
|
|
+// EXIT_FAILURE 1 : Failing exit status.
|
|
+// EXIT_SUCCESS 0 : Successful exit status.
|
|
+// custom shim exit code
|
|
+// SHIM_EXIT_TIMEOUT 2: Container process timeout exit code
|
|
+#define SHIM_EXIT_TIMEOUT 2
|
|
|
|
#define INFO_MSG "info"
|
|
#define WARN_MSG "warn"
|
|
@@ -68,6 +77,8 @@ void close_fd(int *pfd);
|
|
|
|
int open_no_inherit(const char *path, int flag, mode_t mode);
|
|
|
|
+int shim_util_safe_uint64(const char *numstr, uint64_t *converted);
|
|
+
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
diff --git a/src/cmd/isulad-shim/main.c b/src/cmd/isulad-shim/main.c
|
|
index eedd8fda..68e99e53 100644
|
|
--- a/src/cmd/isulad-shim/main.c
|
|
+++ b/src/cmd/isulad-shim/main.c
|
|
@@ -62,7 +62,8 @@ static int set_subreaper()
|
|
return SHIM_OK;
|
|
}
|
|
|
|
-static int parse_args(int argc, char **argv, char **cid, char **bundle, char **rt_name, char **log_level)
|
|
+static int parse_args(int argc, char **argv, char **cid, char **bundle, char **rt_name, char **log_level,
|
|
+ uint64_t *timeout)
|
|
{
|
|
if (argc < 4) {
|
|
return SHIM_ERR;
|
|
@@ -82,6 +83,12 @@ static int parse_args(int argc, char **argv, char **cid, char **bundle, char **r
|
|
}
|
|
}
|
|
|
|
+ if (argc > 5) {
|
|
+ if (shim_util_safe_uint64(strdup(argv[5]), timeout) != 0) {
|
|
+ return SHIM_ERR;
|
|
+ }
|
|
+ }
|
|
+
|
|
return SHIM_OK;
|
|
}
|
|
|
|
@@ -99,6 +106,8 @@ int main(int argc, char **argv)
|
|
int efd = -1;
|
|
process_t *p = NULL;
|
|
pthread_t tid_accept;
|
|
+ // execSync timeout
|
|
+ uint64_t timeout = 0;
|
|
|
|
g_log_fd = open_no_inherit(SHIM_LOG_NAME, O_CREAT | O_WRONLY | O_APPEND | O_SYNC, 0640);
|
|
if (g_log_fd < 0) {
|
|
@@ -117,7 +126,7 @@ int main(int argc, char **argv)
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
- ret = parse_args(argc, argv, &container_id, &bundle, &rt_name, &log_level);
|
|
+ ret = parse_args(argc, argv, &container_id, &bundle, &rt_name, &log_level, &timeout);
|
|
if (ret != SHIM_OK) {
|
|
write_message(g_log_fd, ERR_MSG, "parse args failed:%d", ret);
|
|
exit(EXIT_FAILURE);
|
|
@@ -167,5 +176,5 @@ int main(int argc, char **argv)
|
|
|
|
released_timeout_exit();
|
|
|
|
- return process_signal_handle_routine(p, tid_accept);
|
|
+ return process_signal_handle_routine(p, tid_accept, timeout);
|
|
}
|
|
diff --git a/src/cmd/isulad-shim/process.c b/src/cmd/isulad-shim/process.c
|
|
index 5222629c..02609911 100644
|
|
--- a/src/cmd/isulad-shim/process.c
|
|
+++ b/src/cmd/isulad-shim/process.c
|
|
@@ -1213,69 +1213,145 @@ static int try_wait_all_child(void)
|
|
return 1;
|
|
}
|
|
|
|
-int process_signal_handle_routine(process_t *p, const pthread_t tid_accept)
|
|
+static int waitpid_with_timeout(int ctr_pid, int *status, const int64_t timeout)
|
|
{
|
|
- int ret = SHIM_ERR;
|
|
- bool exit_shim = false;
|
|
int nret = 0;
|
|
- int i;
|
|
- struct timespec ts;
|
|
+ time_t start_time = time(NULL);
|
|
+ time_t end_time;
|
|
+ double interval;
|
|
+ int st;
|
|
|
|
for (;;) {
|
|
- int status;
|
|
- ret = reap_container(p->ctr_pid, &status);
|
|
+ nret = waitpid(-1, &st, WNOHANG);
|
|
+ if (nret == ctr_pid) {
|
|
+ break;
|
|
+ }
|
|
+ end_time = time(NULL);
|
|
+ interval = difftime(end_time, start_time);
|
|
+ if (nret == 0 && interval >= timeout) {
|
|
+ return SHIM_ERR_TIMEOUT;
|
|
+ }
|
|
+ // sleep some time instead to avoid cpu full running and then retry.
|
|
+ usleep(1000);
|
|
+ }
|
|
+
|
|
+ if (WIFSIGNALED(st)) {
|
|
+ *status = EXIT_SIGNAL_OFFSET + WTERMSIG(st);
|
|
+ } else {
|
|
+ *status = WEXITSTATUS(st);
|
|
+ }
|
|
+
|
|
+ if (*status == CONTAINER_ACTION_REBOOT) {
|
|
+ nret = setenv("CONTAINER_ACTION", "reboot", 1);
|
|
+ if (nret != SHIM_OK) {
|
|
+ write_message(g_log_fd, WARN_MSG, "set reboot action failed:%d", SHIM_SYS_ERR(errno));
|
|
+ }
|
|
+ } else if (*status == CONTAINER_ACTION_SHUTDOWN) {
|
|
+ nret = setenv("CONTAINER_ACTION", "shutdown", 1);
|
|
+ if (nret != SHIM_OK) {
|
|
+ write_message(g_log_fd, WARN_MSG, "set shutdown action failed:%d", SHIM_SYS_ERR(errno));
|
|
+ }
|
|
+ }
|
|
+ return SHIM_OK;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * If timeout <= 0, blocking wait in reap_container.
|
|
+ * If timeout > 0, non-blocking wait pid with timeout.
|
|
+ */
|
|
+static int wait_container_process_with_timeout(process_t *p, const unsigned int timeout, int *status)
|
|
+{
|
|
+ int ret = SHIM_ERR;
|
|
+
|
|
+ if (timeout > 0) {
|
|
+ return waitpid_with_timeout(p->ctr_pid, status, timeout);
|
|
+ }
|
|
+
|
|
+ for (;;) {
|
|
+ ret = reap_container(p->ctr_pid, status);
|
|
if (ret == SHIM_OK) {
|
|
- exit_shim = true;
|
|
- if (status == CONTAINER_ACTION_REBOOT) {
|
|
+ if (*status == CONTAINER_ACTION_REBOOT) {
|
|
ret = setenv("CONTAINER_ACTION", "reboot", 1);
|
|
if (ret != SHIM_OK) {
|
|
write_message(g_log_fd, WARN_MSG, "set reboot action failed:%d", SHIM_SYS_ERR(errno));
|
|
}
|
|
- } else if (status == CONTAINER_ACTION_SHUTDOWN) {
|
|
+ } else if (*status == CONTAINER_ACTION_SHUTDOWN) {
|
|
ret = setenv("CONTAINER_ACTION", "shutdown", 1);
|
|
if (ret != SHIM_OK) {
|
|
write_message(g_log_fd, WARN_MSG, "set shutdown action failed:%d", SHIM_SYS_ERR(errno));
|
|
}
|
|
}
|
|
- } else if (ret == SHIM_ERR_WAIT) {
|
|
+ return SHIM_OK;
|
|
+ }
|
|
+
|
|
+ if (ret == SHIM_ERR_WAIT) {
|
|
/* avoid thread entering the infinite loop */
|
|
usleep(1000);
|
|
+ }
|
|
+
|
|
+ if (ret == SHIM_ERR) {
|
|
+ // if the child process is not expected, retry.
|
|
continue;
|
|
}
|
|
- if (exit_shim) {
|
|
- process_kill_all(p);
|
|
+ }
|
|
|
|
- // wait atmost 120 seconds
|
|
- DO_RETRY_CALL(120, 1000000, nret, try_wait_all_child);
|
|
- if (nret != 0) {
|
|
- write_message(g_log_fd, ERR_MSG, "Failed to wait all child after 120 seconds");
|
|
- }
|
|
+}
|
|
|
|
- process_delete(p);
|
|
- if (p->exit_fd > 0) {
|
|
- (void)write_nointr(p->exit_fd, &status, sizeof(int));
|
|
- }
|
|
- // wait for task_console_accept thread termination. In order to make sure that
|
|
- // the io_copy connection is established and io_thread is not used by multiple threads.
|
|
- if (p->state->terminal) {
|
|
- if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
|
|
- write_message(g_log_fd, ERR_MSG, "Failed to get realtime");
|
|
- nret = pthread_join(tid_accept, NULL);
|
|
- } else {
|
|
- // Set the maximum waiting time to 60s to prevent stuck.
|
|
- ts.tv_sec += 60;
|
|
- nret = pthread_timedjoin_np(tid_accept, NULL, &ts);
|
|
- }
|
|
+int process_signal_handle_routine(process_t *p, const pthread_t tid_accept, const unsigned int timeout)
|
|
+{
|
|
+ int i;
|
|
+ int nret = 0;
|
|
+ int ret = 0;
|
|
+ int status = 0;
|
|
+ struct timespec ts;
|
|
|
|
- if (nret != 0) {
|
|
- write_message(g_log_fd, ERR_MSG, "Failed to join task_console_accept thread");
|
|
- }
|
|
- }
|
|
+ ret = wait_container_process_with_timeout(p, timeout, &status);
|
|
+ if (ret == SHIM_ERR_TIMEOUT) {
|
|
+ // kill container process to ensure process_kill_all effective
|
|
+ nret = kill(p->ctr_pid, SIGKILL);
|
|
+ if (nret < 0 && errno != ESRCH) {
|
|
+ write_message(g_log_fd, ERR_MSG, "Can not kill process (pid=%d) with SIGKILL", p->ctr_pid);
|
|
+ exit(EXIT_FAILURE);
|
|
+ }
|
|
+ }
|
|
|
|
- for (i = 0; i < 3; i++) {
|
|
- destroy_io_thread(p, i);
|
|
- }
|
|
- return status;
|
|
+ process_kill_all(p);
|
|
+
|
|
+ // wait atmost 120 seconds
|
|
+ DO_RETRY_CALL(120, 1000000, nret, try_wait_all_child);
|
|
+ if (nret != 0) {
|
|
+ write_message(g_log_fd, ERR_MSG, "Failed to wait all child after 120 seconds");
|
|
+ }
|
|
+
|
|
+ process_delete(p);
|
|
+ if (p->exit_fd > 0) {
|
|
+ (void)write_nointr(p->exit_fd, &status, sizeof(int));
|
|
+ }
|
|
+ // wait for task_console_accept thread termination. In order to make sure that
|
|
+ // the io_copy connection is established and io_thread is not used by multiple threads.
|
|
+ if (p->state->terminal) {
|
|
+ if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
|
|
+ write_message(g_log_fd, ERR_MSG, "Failed to get realtime");
|
|
+ nret = pthread_join(tid_accept, NULL);
|
|
+ } else {
|
|
+ // Set the maximum waiting time to 60s to prevent stuck.
|
|
+ ts.tv_sec += 60;
|
|
+ nret = pthread_timedjoin_np(tid_accept, NULL, &ts);
|
|
}
|
|
+
|
|
+ if (nret != 0) {
|
|
+ write_message(g_log_fd, ERR_MSG, "Failed to join task_console_accept thread");
|
|
+ }
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < 3; i++) {
|
|
+ destroy_io_thread(p, i);
|
|
}
|
|
+
|
|
+ if (ret == SHIM_ERR_TIMEOUT) {
|
|
+ write_message(g_log_fd, INFO_MSG, "Wait %d timeout", p->ctr_pid);
|
|
+ exit(SHIM_EXIT_TIMEOUT);
|
|
+ }
|
|
+ return status;
|
|
+
|
|
}
|
|
diff --git a/src/cmd/isulad-shim/process.h b/src/cmd/isulad-shim/process.h
|
|
index 66820f68..7e3259e8 100644
|
|
--- a/src/cmd/isulad-shim/process.h
|
|
+++ b/src/cmd/isulad-shim/process.h
|
|
@@ -97,7 +97,7 @@ process_t* new_process(char *id, char *bundle, char *runtime);
|
|
int open_io(process_t *p, pthread_t *tid_accept);
|
|
int process_io_init(process_t *p);
|
|
int create_process(process_t *p);
|
|
-int process_signal_handle_routine(process_t *p, const pthread_t tid_accept);
|
|
+int process_signal_handle_routine(process_t *p, const pthread_t tid_accept, const unsigned int timeout);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
diff --git a/src/daemon/modules/runtime/isula/isula_rt_ops.c b/src/daemon/modules/runtime/isula/isula_rt_ops.c
|
|
index e974964a..5a01b8c6 100644
|
|
--- a/src/daemon/modules/runtime/isula/isula_rt_ops.c
|
|
+++ b/src/daemon/modules/runtime/isula/isula_rt_ops.c
|
|
@@ -54,6 +54,7 @@
|
|
#define SHIM_LOG_SIZE ((BUFSIZ - 100) / 2)
|
|
#define RESIZE_DATA_SIZE 100
|
|
#define PID_WAIT_TIME 120
|
|
+#define SHIM_EXIT_TIMEOUT 2
|
|
|
|
// file name formats of cgroup resources json
|
|
#define RESOURCE_FNAME_FORMATS "%s/resources.json"
|
|
@@ -692,27 +693,6 @@ static int status_to_exit_code(int status)
|
|
return exit_code;
|
|
}
|
|
|
|
-static int try_wait_pid(pid_t pid)
|
|
-{
|
|
- if (waitpid(pid, NULL, WNOHANG) == pid) {
|
|
- return 0;
|
|
- }
|
|
-
|
|
- return 1;
|
|
-}
|
|
-
|
|
-static void kill_and_show_err(pid_t pid)
|
|
-{
|
|
- int nret = 0;
|
|
- kill(pid, SIGKILL);
|
|
- // wait atmost 0.5 seconds
|
|
- DO_RETRY_CALL(5, 100000, nret, try_wait_pid, pid);
|
|
- if (nret != 0) {
|
|
- WARN("Fail to wait isulad-shim");
|
|
- }
|
|
- isulad_set_error_message("Exec container error;exec timeout");
|
|
-}
|
|
-
|
|
static int shim_create(bool fg, const char *id, const char *workdir, const char *bundle, const char *runtime_cmd,
|
|
int *exit_code, const int64_t timeout)
|
|
{
|
|
@@ -731,7 +711,14 @@ static int shim_create(bool fg, const char *id, const char *workdir, const char
|
|
params[i++] = bundle;
|
|
params[i++] = runtime_cmd;
|
|
params[i++] = "info";
|
|
- params[i++] = "2m0s";
|
|
+ // execSync timeout
|
|
+ if (timeout > 0) {
|
|
+ params[i] = util_int_to_string(timeout);
|
|
+ if (params[i] == NULL) {
|
|
+ ERROR("Failed to convert execSync timeout %ld to string", timeout);
|
|
+ return -1;
|
|
+ }
|
|
+ }
|
|
runtime_exec_param_dump(params);
|
|
|
|
if (snprintf(fpid, sizeof(fpid), "%s/shim-pid", workdir) < 0) {
|
|
@@ -805,7 +792,7 @@ realexec:
|
|
goto out;
|
|
}
|
|
|
|
- status = util_waitpid_with_timeout(pid, timeout, kill_and_show_err);
|
|
+ status = util_wait_for_pid_status(pid);
|
|
if (status < 0) {
|
|
ERROR("failed wait shim-parent %d exit %s", pid, strerror(errno));
|
|
ret = -1;
|
|
@@ -1204,6 +1191,13 @@ int rt_isula_exec(const char *id, const char *runtime, const rt_exec_params_t *p
|
|
goto errlog_out;
|
|
}
|
|
|
|
+ if (*exit_code == SHIM_EXIT_TIMEOUT) {
|
|
+ ret = -1;
|
|
+ isulad_set_error_message("Exec container error;exec timeout");
|
|
+ ERROR("isulad-shim %d exit for execing timeout", pid);
|
|
+ goto errlog_out;
|
|
+ }
|
|
+
|
|
pid = get_container_process_pid(workdir);
|
|
if (pid < 0) {
|
|
ERROR("%s: failed get exec process id", workdir);
|
|
--
|
|
2.25.1
|
|
|