From d93da1875ed7f1a6061cffb13475506d73c86003 Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Sat, 25 Jul 2020 16:04:19 +0800 Subject: [PATCH 07/50] kata-runtime: enhance reliability when kata related process reason: enhance the reliability when kata related processes are abnormal, so that kata-containers can still destroy the sandbox and clean up all resources. Signed-off-by: jiangpengfei --- cli/delete.go | 6 ++ cli/kill.go | 3 +- virtcontainers/acrn.go | 2 +- virtcontainers/agent.go | 3 + virtcontainers/api.go | 67 +++++++++++++++++++++ virtcontainers/clh.go | 2 +- virtcontainers/container.go | 55 +++++++++++++++-- virtcontainers/fc.go | 2 +- virtcontainers/hypervisor.go | 2 +- virtcontainers/kata_agent.go | 31 ++++++---- virtcontainers/mock_hypervisor.go | 2 +- virtcontainers/mock_hypervisor_test.go | 2 +- virtcontainers/noop_agent.go | 4 ++ virtcontainers/pkg/oci/utils.go | 5 ++ virtcontainers/qemu.go | 51 +++++++++------- virtcontainers/sandbox.go | 106 +++++++++++++++++++++++++++++---- virtcontainers/types/sandbox.go | 14 ++++- virtcontainers/utils/utils.go | 46 ++++++++++++++ virtcontainers/vm.go | 4 +- 19 files changed, 348 insertions(+), 59 deletions(-) diff --git a/cli/delete.go b/cli/delete.go index c2ce52a4..2f5586e5 100644 --- a/cli/delete.go +++ b/cli/delete.go @@ -110,6 +110,12 @@ func delete(ctx context.Context, containerID string, force bool) error { forceStop = true } + if oci.StateToOCIState(status.State.State) == oci.StateUnhealthy { + // Set forceStop and force bool flag to true to force delete everything + forceStop = true + force = true + } + switch containerType { case vc.PodSandbox: if err := deleteSandbox(ctx, sandboxID, force); err != nil { diff --git a/cli/kill.go b/cli/kill.go index 60fa41e0..b228205f 100644 --- a/cli/kill.go +++ b/cli/kill.go @@ -133,11 +133,12 @@ func kill(ctx context.Context, containerID, signal string, all bool) error { kataLog.WithField("signal", signal).WithField("container state", status.State.State).Info("kill") // 
container MUST be created, running or paused + // If container state is unhealthy, should process this exceptional case separately if status.State.State == types.StateReady || status.State.State == types.StateRunning || status.State.State == types.StatePaused { if err := vci.KillContainer(ctx, sandboxID, containerID, signum, all); err != nil { return err } - } else if !all { + } else if !all && status.State.State != types.StateUnhealthy { return fmt.Errorf("container not running") } diff --git a/virtcontainers/acrn.go b/virtcontainers/acrn.go index 761eda03..10cae06f 100644 --- a/virtcontainers/acrn.go +++ b/virtcontainers/acrn.go @@ -475,7 +475,7 @@ func (a *Acrn) waitSandbox(timeoutSecs int) error { } // stopSandbox will stop the Sandbox's VM. -func (a *Acrn) stopSandbox() (err error) { +func (a *Acrn) stopSandbox(force bool) (err error) { span, _ := a.trace("stopSandbox") defer span.Finish() diff --git a/virtcontainers/agent.go b/virtcontainers/agent.go index c62107ec..be9526c7 100644 --- a/virtcontainers/agent.go +++ b/virtcontainers/agent.go @@ -259,4 +259,7 @@ type agent interface { // load data from disk load(persistapi.AgentState) + + // get proxy process pid + getProxyPid() int } diff --git a/virtcontainers/api.go b/virtcontainers/api.go index de569713..fa82d163 100644 --- a/virtcontainers/api.go +++ b/virtcontainers/api.go @@ -7,8 +7,10 @@ package virtcontainers import ( "context" + "fmt" "os" "runtime" + "strings" "syscall" deviceApi "github.com/kata-containers/runtime/virtcontainers/device/api" @@ -18,6 +20,7 @@ import ( vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types" "github.com/kata-containers/runtime/virtcontainers/store" "github.com/kata-containers/runtime/virtcontainers/types" + "github.com/kata-containers/runtime/virtcontainers/utils" specs "github.com/opencontainers/runtime-spec/specs-go" opentracing "github.com/opentracing/opentracing-go" "github.com/sirupsen/logrus" @@ -597,20 +600,51 @@ func statusContainer(sandbox 
*Sandbox, containerID string) (ContainerStatus, err container.state.State == types.StatePaused) && container.process.Pid > 0 { + // If the container state is active but the kata-proxy and qemu processes have both exited already, + // the sandbox has been stopped exceptionally, so we should force delete the + // sandbox and container state files in sandbox.Store + if sandbox.shouldForceDelete() { + virtLog.Logger.Warn("sandbox status is abnormal, sandbox should be force deleted") + sandbox.forceDeleteSandbox() + return ContainerStatus{}, fmt.Errorf("sandbox has beed stopped exceptionally") + } + running, err := isShimRunning(container.process.Pid) if err != nil { return ContainerStatus{}, err } + // If the kata-shim process has exited or been killed, the container needs to be stopped if !running { virtLog.WithFields(logrus.Fields{ "state": container.state.State, "pid": container.process.Pid}). Info("container isn't running") + if err := container.stop(true); err != nil { return ContainerStatus{}, err } } + + isPodSandbox := (containerID == sandbox.id) + + // If sandbox is unhealthy, process it correctly + if !sandbox.health() { + // process podSandbox container type case + if isPodSandbox { + if err := processUnhealthySandbox(sandbox, container); err != nil { + return ContainerStatus{}, err + } + } else { + // If the container type is pod_container, container operations cannot be + // processed successfully, so we should return the error as soon as possible + if err := container.setContainerState(types.StateUnhealthy); err != nil { + return ContainerStatus{}, err + } + + return ContainerStatus{}, fmt.Errorf("container status is unhealthy, stop container failed") + } + } } return ContainerStatus{ @@ -1016,3 +1050,36 @@ func CleanupContainer(ctx context.Context, sandboxID, containerID string, force return nil } + +// processUnhealthySandbox marks the sandbox and all of its containers as unhealthy and, +// when the caller is kata-runtime kill or kata-runtime delete, force stops the podSandbox container's kata-shim +func processUnhealthySandbox(sandbox *Sandbox, 
container *Container) error { + // Set all containers state to unhealthy + if err := sandbox.setContainersState(types.StateUnhealthy); err != nil { + container.Logger().WithError(err).Warn("set all containers state to unhealthy fail") + } + + // Set sandbox state to unhealthy + if err := sandbox.setSandboxState(types.StateUnhealthy); err != nil { + container.Logger().WithError(err).Warn("set sandbox state to unhealthy fail") + } + + forceDelete := false + + // If process is kata-runtime kill or kata-runtime delete, + // we should kill or delete sandbox forcefully + if cmdline, err := utils.GetProcessCmdline(os.Getpid()); err != nil { + container.Logger().WithError(err).Warn("fail to get process cmdline info") + } else { + forceDelete = strings.Contains(cmdline, "kill") || strings.Contains(cmdline, "delete") + } + + if forceDelete { + // force stop podSandbox type container's kata-shim process + if err := stopShim(container.process.Pid); err != nil { + container.Logger().WithError(err).Warn("fail to stop podSandbox type container kata-shim") + } + } + + return nil +} diff --git a/virtcontainers/clh.go b/virtcontainers/clh.go index d40b698b..59510b02 100644 --- a/virtcontainers/clh.go +++ b/virtcontainers/clh.go @@ -569,7 +569,7 @@ func (clh *cloudHypervisor) resumeSandbox() error { } // stopSandbox will stop the Sandbox's VM. 
-func (clh *cloudHypervisor) stopSandbox() (err error) { +func (clh *cloudHypervisor) stopSandbox(force bool) (err error) { span, _ := clh.trace("stopSandbox") defer span.Finish() clh.Logger().WithField("function", "stopSandbox").Info("Stop Sandbox") diff --git a/virtcontainers/container.go b/virtcontainers/container.go index b42cc6e9..9485e708 100644 --- a/virtcontainers/container.go +++ b/virtcontainers/container.go @@ -17,8 +17,12 @@ import ( "time" "github.com/containerd/cgroups" + "github.com/kata-containers/runtime/virtcontainers/device/config" + "github.com/kata-containers/runtime/virtcontainers/device/manager" vccgroups "github.com/kata-containers/runtime/virtcontainers/pkg/cgroups" + "github.com/kata-containers/runtime/virtcontainers/pkg/rootless" vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types" + "github.com/kata-containers/runtime/virtcontainers/store" "github.com/kata-containers/runtime/virtcontainers/types" "github.com/kata-containers/runtime/virtcontainers/utils" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -26,11 +30,6 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" - - "github.com/kata-containers/runtime/virtcontainers/device/config" - "github.com/kata-containers/runtime/virtcontainers/device/manager" - "github.com/kata-containers/runtime/virtcontainers/pkg/rootless" - "github.com/kata-containers/runtime/virtcontainers/store" ) // https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h @@ -1047,6 +1046,13 @@ func (c *Container) stop(force bool) error { return nil } + // If container state is unhealthy, just force kill the container + if c.state.State == types.StateUnhealthy { + c.forceKillContainer() + // after force kill container, then change container state to stopped + return c.setContainerState(types.StateStopped) + } + if err := c.state.ValidTransition(c.state.State, types.StateStopped); err != nil { return err } @@ -1063,6 +1069,8 @@ func (c 
*Container) stop(force bool) error { if err := stopShim(c.process.Pid); err != nil { l.WithError(err).Warn("failed to stop shim") } + + c.forceKillContainer() } }() @@ -1096,7 +1104,9 @@ func (c *Container) stop(force bool) error { // this signal will ensure the container will get killed to match // the state of the shim. This will allow the following call to // stopContainer() to succeed in such particular case. - c.kill(syscall.SIGKILL, true) + if err := c.kill(syscall.SIGKILL, true); err != nil { + c.Logger().Errorf("send signal to container failed: %v", err) + } // Since the agent has supported the MultiWaitProcess, it's better to // wait the process here to make sure the process has exited before to @@ -1582,3 +1592,36 @@ func (c *Container) cgroupsUpdate(resources specs.LinuxResources) error { return nil } + +// forceDeleteContainer force clean container mount info and resources stored in the disk +func (c *Container) forceDeleteContainer() { + if err := c.unmountHostMounts(); err != nil { + c.Logger().WithError(err).Warn("container force delete umount host mounts fail") + } + + if err := c.sandbox.removeContainer(c.id); err != nil { + c.Logger().WithError(err).Warn("sandbox removeContainer fail") + } + + if err := c.store.Delete(); err != nil { + c.Logger().WithError(err).Warn("force delete container store fail") + } +} + +func (c *Container) forceKillContainer() { + if err := c.setContainerState(types.StateStopped); err != nil { + c.Logger().WithError(err).Warn("force kill container: change container state to StateStopped failed") + } + + if err := c.unmountHostMounts(); err != nil { + c.Logger().WithError(err).Warn("force kill container: umount container host mounts failed") + } + + if err := c.detachDevices(); err != nil { + c.Logger().WithError(err).Warn("force kill container: detach container devices failed") + } + + if err := c.removeDrive(); err != nil { + c.Logger().WithError(err).Warn("force kill container: remove container drive failed") + } +} 
diff --git a/virtcontainers/fc.go b/virtcontainers/fc.go index 97ef5ffc..72a8e192 100644 --- a/virtcontainers/fc.go +++ b/virtcontainers/fc.go @@ -864,7 +864,7 @@ func (fc *firecracker) cleanupJail() { } // stopSandbox will stop the Sandbox's VM. -func (fc *firecracker) stopSandbox() (err error) { +func (fc *firecracker) stopSandbox(force bool) (err error) { span, _ := fc.trace("stopSandbox") defer span.Finish() diff --git a/virtcontainers/hypervisor.go b/virtcontainers/hypervisor.go index 4b3dd3d0..fd7d1f8e 100644 --- a/virtcontainers/hypervisor.go +++ b/virtcontainers/hypervisor.go @@ -766,7 +766,7 @@ func generateVMSocket(id string, useVsock bool, vmStogarePath string) (interface type hypervisor interface { createSandbox(ctx context.Context, id string, networkNS NetworkNamespace, hypervisorConfig *HypervisorConfig, stateful bool) error startSandbox(timeout int) error - stopSandbox() error + stopSandbox(force bool) error pauseSandbox() error saveSandbox() error resumeSandbox() error diff --git a/virtcontainers/kata_agent.go b/virtcontainers/kata_agent.go index be5e96aa..7575d326 100644 --- a/virtcontainers/kata_agent.go +++ b/virtcontainers/kata_agent.go @@ -57,8 +57,9 @@ const ( ) var ( - checkRequestTimeout = 30 * time.Second - defaultRequestTimeout = 60 * time.Second + checkRequestTimeout = 10 * time.Second + defaultRequestTimeout = 10 * time.Second + createContainerTimeout = 120 * time.Second errorMissingProxy = errors.New("Missing proxy pointer") errorMissingOCISpec = errors.New("Missing OCI specification") defaultKataHostSharedDir = "/run/kata-containers/shared/sandboxes/" @@ -987,17 +988,21 @@ func (k *kataAgent) stopSandbox(sandbox *Sandbox) error { k.state.URL = "" }() - req := &grpc.DestroySandboxRequest{} + // If sandbox.state.State is unhealthy, we don't need to send DestroySandboxRequest + // to kata-agent, just force stop the sandbox + if sandbox.state.State != types.StateUnhealthy { + req := &grpc.DestroySandboxRequest{} - if _, err := 
k.sendReq(req); err != nil { - return err - } - - if k.dynamicTracing { - _, err := k.sendReq(&grpc.StopTracingRequest{}) - if err != nil { + if _, err := k.sendReq(req); err != nil { return err } + + if k.dynamicTracing { + _, err := k.sendReq(&grpc.StopTracingRequest{}) + if err != nil { + return err + } + } } return nil @@ -2062,6 +2067,8 @@ func (k *kataAgent) getReqContext(reqName string) (ctx context.Context, cancel c // Wait has no timeout case grpcCheckRequest: ctx, cancel = context.WithTimeout(ctx, checkRequestTimeout) + case grpcCreateContainerRequest: + ctx, cancel = context.WithTimeout(ctx, createContainerTimeout) default: ctx, cancel = context.WithTimeout(ctx, defaultRequestTimeout) } @@ -2382,3 +2389,7 @@ func (k *kataAgent) load(s persistapi.AgentState) { k.state.ProxyPid = s.ProxyPid k.state.URL = s.URL } + +func (k *kataAgent) getProxyPid() int { + return k.state.ProxyPid +} diff --git a/virtcontainers/mock_hypervisor.go b/virtcontainers/mock_hypervisor.go index 0c84e43c..a5b67491 100644 --- a/virtcontainers/mock_hypervisor.go +++ b/virtcontainers/mock_hypervisor.go @@ -39,7 +39,7 @@ func (m *mockHypervisor) startSandbox(timeout int) error { return nil } -func (m *mockHypervisor) stopSandbox() error { +func (m *mockHypervisor) stopSandbox(force bool) error { return nil } diff --git a/virtcontainers/mock_hypervisor_test.go b/virtcontainers/mock_hypervisor_test.go index b73b28f2..827e3192 100644 --- a/virtcontainers/mock_hypervisor_test.go +++ b/virtcontainers/mock_hypervisor_test.go @@ -53,7 +53,7 @@ func TestMockHypervisorStartSandbox(t *testing.T) { func TestMockHypervisorStopSandbox(t *testing.T) { var m *mockHypervisor - assert.NoError(t, m.stopSandbox()) + assert.NoError(t, m.stopSandbox(false)) } func TestMockHypervisorAddDevice(t *testing.T) { diff --git a/virtcontainers/noop_agent.go b/virtcontainers/noop_agent.go index 189f6b3f..8a7cd337 100644 --- a/virtcontainers/noop_agent.go +++ b/virtcontainers/noop_agent.go @@ -236,3 +236,7 @@ func (n 
*noopAgent) save() (s persistapi.AgentState) { // load is the Noop agent state loader. It does nothing. func (n *noopAgent) load(s persistapi.AgentState) {} + +func (n *noopAgent) getProxyPid() int { + return -1 +} \ No newline at end of file diff --git a/virtcontainers/pkg/oci/utils.go b/virtcontainers/pkg/oci/utils.go index 5348c57d..cd8d48ce 100644 --- a/virtcontainers/pkg/oci/utils.go +++ b/virtcontainers/pkg/oci/utils.go @@ -70,6 +70,9 @@ const ( // StatePaused represents a container that has been paused. StatePaused = "paused" + + // StateUnhealthy represents a container that is unhealthy + StateUnhealthy = "unhealthy" ) const KernelModulesSeparator = ";" @@ -964,6 +967,8 @@ func StateToOCIState(state types.StateString) string { return StateStopped case types.StatePaused: return StatePaused + case types.StateUnhealthy: + return StateUnhealthy default: return "" } diff --git a/virtcontainers/qemu.go b/virtcontainers/qemu.go index ca286550..4b15d968 100644 --- a/virtcontainers/qemu.go +++ b/virtcontainers/qemu.go @@ -687,7 +687,7 @@ func (q *qemu) setupVirtiofsd() (err error) { q.Logger().Info("virtiofsd quits") // Wait to release resources of virtiofsd process cmd.Process.Wait() - q.stopSandbox() + q.stopSandbox(false) }() return err } @@ -922,11 +922,11 @@ func (q *qemu) waitSandbox(timeout int) error { } // stopSandbox will stop the Sandbox's VM. 
-func (q *qemu) stopSandbox() error { +func (q *qemu) stopSandbox(force bool) error { span, _ := q.trace("stopSandbox") defer span.Finish() - q.Logger().Info("Stopping Sandbox") + q.Logger().Infof("force stopping Sandbox: %v", force) if q.stopped { q.Logger().Info("Already stopped") return nil @@ -937,28 +937,37 @@ func (q *qemu) stopSandbox() error { q.stopped = true }() - if q.config.Debug && q.qemuConfig.LogFile != "" { - f, err := os.OpenFile(q.qemuConfig.LogFile, os.O_RDONLY, 0) - if err == nil { - scanner := bufio.NewScanner(f) - for scanner.Scan() { - q.Logger().Debug(scanner.Text()) - } - if err := scanner.Err(); err != nil { - q.Logger().WithError(err).Debug("read qemu log failed") + if !force { + if q.config.Debug && q.qemuConfig.LogFile != "" { + f, err := os.OpenFile(q.qemuConfig.LogFile, os.O_RDONLY, 0) + if err == nil { + scanner := bufio.NewScanner(f) + for scanner.Scan() { + q.Logger().Debug(scanner.Text()) + } + if err := scanner.Err(); err != nil { + q.Logger().WithError(err).Debug("read qemu log failed") + } } } - } - err := q.qmpSetup() - if err != nil { - return err - } + err := q.qmpSetup() + if err != nil { + return err + } - err = q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx) - if err != nil { - q.Logger().WithError(err).Error("Fail to execute qmp QUIT") - return err + err = q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx) + if err != nil { + q.Logger().WithError(err).Error("Fail to execute qmp QUIT") + return err + } + } else { + qemuMainPid := q.getPids()[0] + if qemuMainPid <= 1 { + return fmt.Errorf("force kill qemu process pid is invalid") + } + + _ = syscall.Kill(qemuMainPid, syscall.SIGKILL) } return nil diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go index edd1af5b..78188ed7 100644 --- a/virtcontainers/sandbox.go +++ b/virtcontainers/sandbox.go @@ -12,19 +12,13 @@ import ( "math" "net" "os" + "path/filepath" "strings" "sync" "syscall" "github.com/containerd/cgroups" 
"github.com/containernetworking/plugins/pkg/ns" - "github.com/opencontainers/runc/libcontainer/configs" - specs "github.com/opencontainers/runtime-spec/specs-go" - opentracing "github.com/opentracing/opentracing-go" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" - "github.com/vishvananda/netlink" - "github.com/kata-containers/agent/protocols/grpc" "github.com/kata-containers/runtime/virtcontainers/device/api" "github.com/kata-containers/runtime/virtcontainers/device/config" @@ -41,6 +35,12 @@ import ( "github.com/kata-containers/runtime/virtcontainers/store" "github.com/kata-containers/runtime/virtcontainers/types" "github.com/kata-containers/runtime/virtcontainers/utils" + "github.com/opencontainers/runc/libcontainer/configs" + specs "github.com/opencontainers/runtime-spec/specs-go" + opentracing "github.com/opentracing/opentracing-go" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "github.com/vishvananda/netlink" ) const ( @@ -50,6 +50,9 @@ const ( // DirMode is the permission bits used for creating a directory DirMode = os.FileMode(0750) | os.ModeDir + + // KataProxyProcessName is the kata-proxy process name + KataProxyProcessName = "kata-proxy" ) // SandboxStatus describes a sandbox status. 
@@ -1037,7 +1040,7 @@ func (s *Sandbox) startVM() (err error) { defer func() { if err != nil { - s.hypervisor.stopSandbox() + s.hypervisor.stopSandbox(false) } }() @@ -1090,7 +1093,12 @@ func (s *Sandbox) stopVM() error { } s.Logger().Info("Stopping VM") - return s.hypervisor.stopSandbox() + forceStop := false + if s.state.State == types.StateUnhealthy { + forceStop = true + } + + return s.hypervisor.stopSandbox(forceStop) } func (s *Sandbox) addContainer(c *Container) error { @@ -1591,13 +1599,15 @@ func (s *Sandbox) setSandboxState(state types.StateString) error { return vcTypes.ErrNeedState } + s.Logger().Debugf("Setting sandbox state from %v to %v", s.state.State, state) // update in-memory state s.state.State = state if useOldStore(s.ctx) { return s.store.Store(store.State, s.state) + } else { + return s.Save() } - return nil } const maxBlockIndex = 65535 @@ -2207,3 +2217,79 @@ func (s *Sandbox) GetPatchedOCISpec() *specs.Spec { return nil } + +// health reports whether the sandbox is currently healthy. +// If the qemu/kata-proxy/kata-agent process is abnormal, +// s.agent.check() returns an error and health reports false +func (s *Sandbox) health() bool { + err := s.agent.check() + if err != nil { + return false + } + + return true +} + +// shouldForceDelete reports whether the sandbox should be force deleted: both the kata-proxy and hypervisor +// processes have already exited and the current command line contains "delete" and "force" +func (s *Sandbox) shouldForceDelete() bool { + cmdline, err := utils.GetProcessCmdline(os.Getpid()) + if err != nil { + s.Logger().Errorf("fail to get process cmdline: %v", err) + return false + } + + proxyPid := s.agent.getProxyPid() + hypervisorPids := s.hypervisor.getPids() + if len(hypervisorPids) <= 0 { + s.Logger().Warnf("get hypervisor main pid fail") + return false + } + hypervisorMainPid := hypervisorPids[0] + hypervisorPath := s.hypervisor.hypervisorConfig().HypervisorPath + hypervisorName := filepath.Base(hypervisorPath) + + if !utils.IsProcessRunning(proxyPid, KataProxyProcessName, s.id) 
&& !utils.IsProcessRunning(hypervisorMainPid, hypervisorName, s.id) && + strings.Contains(cmdline, "delete") && strings.Contains(cmdline, "force") { + return true + } + + return false +} + +func (s *Sandbox) forceDeleteSandbox() { + for _, c := range s.containers { + // force delete all containers in the sandbox + c.forceDeleteContainer() + } + + globalSandboxList.removeSandbox(s.id) + + if s.monitor != nil { + s.monitor.stop() + } + + if err := s.hypervisor.cleanup(); err != nil { + s.Logger().WithError(err).Error("failed to force cleanup hypervisor resource") + } + + s.agent.cleanup(s) + + if err := s.store.Delete(); err != nil { + s.Logger().WithError(err).Warn("sandbox force delete store failed") + } +} + +func (s *Sandbox) setContainersState(state types.StateString) error { + if state == "" { + return vcTypes.ErrNeedState + } + + for _, c := range s.containers { + if err := c.setContainerState(state); err != nil { + return err + } + } + + return nil +} diff --git a/virtcontainers/types/sandbox.go b/virtcontainers/types/sandbox.go index 3b64b20a..5d586b21 100644 --- a/virtcontainers/types/sandbox.go +++ b/virtcontainers/types/sandbox.go @@ -28,6 +28,9 @@ const ( // StateStopped represents a sandbox/container that has been stopped. StateStopped StateString = "stopped" + + // StateUnhealthy represents a sandbox/container that's in abnormal state. 
+ StateUnhealthy StateString = "unhealthy" ) const ( @@ -90,17 +93,17 @@ func (state *StateString) validTransition(oldState StateString, newState StateSt switch *state { case StateReady: - if newState == StateRunning || newState == StateStopped { + if newState == StateRunning || newState == StateStopped || newState == StateUnhealthy { return nil } case StateRunning: - if newState == StatePaused || newState == StateStopped { + if newState == StatePaused || newState == StateStopped || newState == StateUnhealthy { return nil } case StatePaused: - if newState == StateRunning || newState == StateStopped { + if newState == StateRunning || newState == StateStopped || newState == StateUnhealthy { return nil } @@ -108,6 +111,11 @@ func (state *StateString) validTransition(oldState StateString, newState StateSt if newState == StateRunning { return nil } + + case StateUnhealthy: + if newState == StateStopped { + return nil + } } return fmt.Errorf("Can not move from %v to %v", diff --git a/virtcontainers/utils/utils.go b/virtcontainers/utils/utils.go index 85c55489..2b555ebb 100644 --- a/virtcontainers/utils/utils.go +++ b/virtcontainers/utils/utils.go @@ -9,9 +9,13 @@ import ( "crypto/rand" "errors" "fmt" + "io/ioutil" "os" "os/exec" "path/filepath" + "strconv" + "strings" + "syscall" ) const cpBinaryName = "cp" @@ -275,3 +279,45 @@ const ( MiB = KiB << 10 GiB = MiB << 10 ) + +// GetProcessCmdline returns the command line of process pid, read from the /proc/<pid>/cmdline file +func GetProcessCmdline(pid int) (cmdline string, err error) { + if pid <= 1 { + return "", fmt.Errorf("invalid pid number") + } + + bytes, err := ioutil.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "cmdline")) + if err != nil { + return "", err + } + + return string(bytes), nil +} + +func IsProcessRunning(pid int, processName string, sandboxID string) bool { + if pid <= 0 { + return false + } + + process, err := os.FindProcess(pid) + if err != nil { + return false + } + + if err := process.Signal(syscall.Signal(0)); err != nil { + return 
false + } + + cmdline, err := GetProcessCmdline(pid) + if err != nil { + return false + } + + // If the process's cmdline contains both the processName and sandboxID keywords, + // we assume the pid has not been reused by another process + if strings.Contains(cmdline, processName) && strings.Contains(cmdline, sandboxID) { + return true + } + + return false +} diff --git a/virtcontainers/vm.go b/virtcontainers/vm.go index fcda1e97..8d27b1fe 100644 --- a/virtcontainers/vm.go +++ b/virtcontainers/vm.go @@ -191,7 +191,7 @@ func NewVM(ctx context.Context, config VMConfig) (*VM, error) { defer func() { if err != nil { virtLog.WithField("vm", id).WithError(err).Info("clean up vm") - hypervisor.stopSandbox() + hypervisor.stopSandbox(false) } }() @@ -333,7 +333,7 @@ func (v *VM) Disconnect() error { func (v *VM) Stop() error { v.logger().Info("stop vm") - if err := v.hypervisor.stopSandbox(); err != nil { + if err := v.hypervisor.stopSandbox(false); err != nil { return err } -- 2.14.3 (Apple Git-98)