kata-containers/runtime/patches/0007-kata-runtime-enhance-reliability-when-kata-related-p.patch
holyfei c709612f2a kata-containers: modify kata-containers version
Fix #I4KI81
reason: modify kata-containers version and update
it to 1.11.1

Signed-off-by: holyfei <yangfeiyu20092010@163.com>
2021-11-30 20:08:25 +08:00

856 lines
27 KiB
Diff

From d93da1875ed7f1a6061cffb13475506d73c86003 Mon Sep 17 00:00:00 2001
From: jiangpengfei <jiangpengfei9@huawei.com>
Date: Sat, 25 Jul 2020 16:04:19 +0800
Subject: [PATCH 07/50] kata-runtime: enhance reliability when kata related
process
reason: enhance reliability when kata related processes are abnormal,
so that kata-containers can still destroy the sandbox and clean up all resources.
Signed-off-by: jiangpengfei <jiangpengfei9@huawei.com>
---
cli/delete.go | 6 ++
cli/kill.go | 3 +-
virtcontainers/acrn.go | 2 +-
virtcontainers/agent.go | 3 +
virtcontainers/api.go | 67 +++++++++++++++++++++
virtcontainers/clh.go | 2 +-
virtcontainers/container.go | 55 +++++++++++++++--
virtcontainers/fc.go | 2 +-
virtcontainers/hypervisor.go | 2 +-
virtcontainers/kata_agent.go | 31 ++++++----
virtcontainers/mock_hypervisor.go | 2 +-
virtcontainers/mock_hypervisor_test.go | 2 +-
virtcontainers/noop_agent.go | 4 ++
virtcontainers/pkg/oci/utils.go | 5 ++
virtcontainers/qemu.go | 51 +++++++++-------
virtcontainers/sandbox.go | 106 +++++++++++++++++++++++++++++----
virtcontainers/types/sandbox.go | 14 ++++-
virtcontainers/utils/utils.go | 46 ++++++++++++++
virtcontainers/vm.go | 4 +-
19 files changed, 348 insertions(+), 59 deletions(-)
diff --git a/cli/delete.go b/cli/delete.go
index c2ce52a4..2f5586e5 100644
--- a/cli/delete.go
+++ b/cli/delete.go
@@ -110,6 +110,12 @@ func delete(ctx context.Context, containerID string, force bool) error {
forceStop = true
}
+ if oci.StateToOCIState(status.State.State) == oci.StateUnhealthy {
+ // Set forceStop and force bool flag to true to force delete everything
+ forceStop = true
+ force = true
+ }
+
switch containerType {
case vc.PodSandbox:
if err := deleteSandbox(ctx, sandboxID, force); err != nil {
diff --git a/cli/kill.go b/cli/kill.go
index 60fa41e0..b228205f 100644
--- a/cli/kill.go
+++ b/cli/kill.go
@@ -133,11 +133,12 @@ func kill(ctx context.Context, containerID, signal string, all bool) error {
kataLog.WithField("signal", signal).WithField("container state", status.State.State).Info("kill")
// container MUST be created, running or paused
+ // An unhealthy container is an exceptional case: it is not killed here, but it is not reported as "not running" either
if status.State.State == types.StateReady || status.State.State == types.StateRunning || status.State.State == types.StatePaused {
if err := vci.KillContainer(ctx, sandboxID, containerID, signum, all); err != nil {
return err
}
- } else if !all {
+ } else if !all && status.State.State != types.StateUnhealthy {
return fmt.Errorf("container not running")
}
diff --git a/virtcontainers/acrn.go b/virtcontainers/acrn.go
index 761eda03..10cae06f 100644
--- a/virtcontainers/acrn.go
+++ b/virtcontainers/acrn.go
@@ -475,7 +475,7 @@ func (a *Acrn) waitSandbox(timeoutSecs int) error {
}
// stopSandbox will stop the Sandbox's VM.
-func (a *Acrn) stopSandbox() (err error) {
+func (a *Acrn) stopSandbox(force bool) (err error) {
span, _ := a.trace("stopSandbox")
defer span.Finish()
diff --git a/virtcontainers/agent.go b/virtcontainers/agent.go
index c62107ec..be9526c7 100644
--- a/virtcontainers/agent.go
+++ b/virtcontainers/agent.go
@@ -259,4 +259,7 @@ type agent interface {
// load data from disk
load(persistapi.AgentState)
+
+ // get proxy process pid
+ getProxyPid() int
}
diff --git a/virtcontainers/api.go b/virtcontainers/api.go
index de569713..fa82d163 100644
--- a/virtcontainers/api.go
+++ b/virtcontainers/api.go
@@ -7,8 +7,10 @@ package virtcontainers
import (
"context"
+ "fmt"
"os"
"runtime"
+ "strings"
"syscall"
deviceApi "github.com/kata-containers/runtime/virtcontainers/device/api"
@@ -18,6 +20,7 @@ import (
vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types"
"github.com/kata-containers/runtime/virtcontainers/store"
"github.com/kata-containers/runtime/virtcontainers/types"
+ "github.com/kata-containers/runtime/virtcontainers/utils"
specs "github.com/opencontainers/runtime-spec/specs-go"
opentracing "github.com/opentracing/opentracing-go"
"github.com/sirupsen/logrus"
@@ -597,20 +600,51 @@ func statusContainer(sandbox *Sandbox, containerID string) (ContainerStatus, err
container.state.State == types.StatePaused) &&
container.process.Pid > 0 {
+ // If the container state is active but the kata-proxy and qemu processes have already exited,
+ // the sandbox has been stopped exceptionally, so we should force delete
+ // the sandbox and container state files in sandbox.Store
+ if sandbox.shouldForceDelete() {
+ virtLog.Logger.Warn("sandbox status is abnormal, sandbox should be force deleted")
+ sandbox.forceDeleteSandbox()
+ return ContainerStatus{}, fmt.Errorf("sandbox has beed stopped exceptionally")
+ }
+
running, err := isShimRunning(container.process.Pid)
if err != nil {
return ContainerStatus{}, err
}
+ // If the kata-shim process has exited or been killed, the container needs to be stopped
if !running {
virtLog.WithFields(logrus.Fields{
"state": container.state.State,
"pid": container.process.Pid}).
Info("container isn't running")
+
if err := container.stop(true); err != nil {
return ContainerStatus{}, err
}
}
+
+ isPodSandbox := (containerID == sandbox.id)
+
+ // If sandbox is unhealthy, process it correctly
+ if !sandbox.health() {
+ // process podSandbox container type case
+ if isPodSandbox {
+ if err := processUnhealthySandbox(sandbox, container); err != nil {
+ return ContainerStatus{}, err
+ }
+ } else {
+ // If container type is pod_container, which means container operations can not be
+ // processed successfully, we should return the error as soon as possible
+ if err := container.setContainerState(types.StateUnhealthy); err != nil {
+ return ContainerStatus{}, err
+ }
+
+ return ContainerStatus{}, fmt.Errorf("container status is unhealthy, stop container failed")
+ }
+ }
}
return ContainerStatus{
@@ -1016,3 +1050,36 @@ func CleanupContainer(ctx context.Context, sandboxID, containerID string, force
return nil
}
+
+// processUnhealthySandbox marks every container and the sandbox itself as unhealthy; when the
+// current process cmdline contains "kill" or "delete" it also stops the given container's shim.
+func processUnhealthySandbox(sandbox *Sandbox, container *Container) error {
+ // Set all containers state to unhealthy; a failure is only logged
+ if err := sandbox.setContainersState(types.StateUnhealthy); err != nil {
+ container.Logger().WithError(err).Warn("set all containers state to unhealthy fail")
+ }
+
+ // Set sandbox state to unhealthy; a failure is only logged
+ if err := sandbox.setSandboxState(types.StateUnhealthy); err != nil {
+ container.Logger().WithError(err).Warn("set sandbox state to unhealthy fail")
+ }
+
+ forceDelete := false
+
+ // If the current process cmdline contains "kill" or "delete" (presumably
+ // kata-runtime kill / kata-runtime delete), the shim is stopped forcefully below
+ if cmdline, err := utils.GetProcessCmdline(os.Getpid()); err != nil {
+ container.Logger().WithError(err).Warn("fail to get process cmdline info")
+ } else {
+ forceDelete = strings.Contains(cmdline, "kill") || strings.Contains(cmdline, "delete")
+ }
+
+ if forceDelete {
+ // force stop podSandbox type container's kata-shim process
+ if err := stopShim(container.process.Pid); err != nil {
+ container.Logger().WithError(err).Warn("fail to stop podSandbox type container kata-shim")
+ }
+ }
+
+ return nil
+}
diff --git a/virtcontainers/clh.go b/virtcontainers/clh.go
index d40b698b..59510b02 100644
--- a/virtcontainers/clh.go
+++ b/virtcontainers/clh.go
@@ -569,7 +569,7 @@ func (clh *cloudHypervisor) resumeSandbox() error {
}
// stopSandbox will stop the Sandbox's VM.
-func (clh *cloudHypervisor) stopSandbox() (err error) {
+func (clh *cloudHypervisor) stopSandbox(force bool) (err error) {
span, _ := clh.trace("stopSandbox")
defer span.Finish()
clh.Logger().WithField("function", "stopSandbox").Info("Stop Sandbox")
diff --git a/virtcontainers/container.go b/virtcontainers/container.go
index b42cc6e9..9485e708 100644
--- a/virtcontainers/container.go
+++ b/virtcontainers/container.go
@@ -17,8 +17,12 @@ import (
"time"
"github.com/containerd/cgroups"
+ "github.com/kata-containers/runtime/virtcontainers/device/config"
+ "github.com/kata-containers/runtime/virtcontainers/device/manager"
vccgroups "github.com/kata-containers/runtime/virtcontainers/pkg/cgroups"
+ "github.com/kata-containers/runtime/virtcontainers/pkg/rootless"
vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types"
+ "github.com/kata-containers/runtime/virtcontainers/store"
"github.com/kata-containers/runtime/virtcontainers/types"
"github.com/kata-containers/runtime/virtcontainers/utils"
specs "github.com/opencontainers/runtime-spec/specs-go"
@@ -26,11 +30,6 @@ import (
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
-
- "github.com/kata-containers/runtime/virtcontainers/device/config"
- "github.com/kata-containers/runtime/virtcontainers/device/manager"
- "github.com/kata-containers/runtime/virtcontainers/pkg/rootless"
- "github.com/kata-containers/runtime/virtcontainers/store"
)
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h
@@ -1047,6 +1046,13 @@ func (c *Container) stop(force bool) error {
return nil
}
+ // If container state is unhealthy, just force kill the container
+ if c.state.State == types.StateUnhealthy {
+ c.forceKillContainer()
+ // after force kill container, then change container state to stopped
+ return c.setContainerState(types.StateStopped)
+ }
+
if err := c.state.ValidTransition(c.state.State, types.StateStopped); err != nil {
return err
}
@@ -1063,6 +1069,8 @@ func (c *Container) stop(force bool) error {
if err := stopShim(c.process.Pid); err != nil {
l.WithError(err).Warn("failed to stop shim")
}
+
+ c.forceKillContainer()
}
}()
@@ -1096,7 +1104,9 @@ func (c *Container) stop(force bool) error {
// this signal will ensure the container will get killed to match
// the state of the shim. This will allow the following call to
// stopContainer() to succeed in such particular case.
- c.kill(syscall.SIGKILL, true)
+ if err := c.kill(syscall.SIGKILL, true); err != nil {
+ c.Logger().Errorf("send signal to container failed: %v", err)
+ }
// Since the agent has supported the MultiWaitProcess, it's better to
// wait the process here to make sure the process has exited before to
@@ -1582,3 +1592,36 @@ func (c *Container) cgroupsUpdate(resources specs.LinuxResources) error {
return nil
}
+
+// forceDeleteContainer unmounts the container's host mounts, removes it from its sandbox and deletes its store; each failure is only logged.
+func (c *Container) forceDeleteContainer() {
+ if err := c.unmountHostMounts(); err != nil {
+ c.Logger().WithError(err).Warn("container force delete umount host mounts fail")
+ }
+
+ if err := c.sandbox.removeContainer(c.id); err != nil {
+ c.Logger().WithError(err).Warn("sandbox removeContainer fail")
+ }
+
+ if err := c.store.Delete(); err != nil {
+ c.Logger().WithError(err).Warn("force delete container store fail")
+ }
+}
+
+func (c *Container) forceKillContainer() {
+ if err := c.setContainerState(types.StateStopped); err != nil {
+ c.Logger().WithError(err).Warn("force kill container: change container state to StateStopped failed")
+ }
+
+ if err := c.unmountHostMounts(); err != nil {
+ c.Logger().WithError(err).Warn("force kill container: umount container host mounts failed")
+ }
+
+ if err := c.detachDevices(); err != nil {
+ c.Logger().WithError(err).Warn("force kill container: detach container devices failed")
+ }
+
+ if err := c.removeDrive(); err != nil {
+ c.Logger().WithError(err).Warn("force kill container: remove container drive failed")
+ }
+}
diff --git a/virtcontainers/fc.go b/virtcontainers/fc.go
index 97ef5ffc..72a8e192 100644
--- a/virtcontainers/fc.go
+++ b/virtcontainers/fc.go
@@ -864,7 +864,7 @@ func (fc *firecracker) cleanupJail() {
}
// stopSandbox will stop the Sandbox's VM.
-func (fc *firecracker) stopSandbox() (err error) {
+func (fc *firecracker) stopSandbox(force bool) (err error) {
span, _ := fc.trace("stopSandbox")
defer span.Finish()
diff --git a/virtcontainers/hypervisor.go b/virtcontainers/hypervisor.go
index 4b3dd3d0..fd7d1f8e 100644
--- a/virtcontainers/hypervisor.go
+++ b/virtcontainers/hypervisor.go
@@ -766,7 +766,7 @@ func generateVMSocket(id string, useVsock bool, vmStogarePath string) (interface
type hypervisor interface {
createSandbox(ctx context.Context, id string, networkNS NetworkNamespace, hypervisorConfig *HypervisorConfig, stateful bool) error
startSandbox(timeout int) error
- stopSandbox() error
+ stopSandbox(force bool) error
pauseSandbox() error
saveSandbox() error
resumeSandbox() error
diff --git a/virtcontainers/kata_agent.go b/virtcontainers/kata_agent.go
index be5e96aa..7575d326 100644
--- a/virtcontainers/kata_agent.go
+++ b/virtcontainers/kata_agent.go
@@ -57,8 +57,9 @@ const (
)
var (
- checkRequestTimeout = 30 * time.Second
- defaultRequestTimeout = 60 * time.Second
+ checkRequestTimeout = 10 * time.Second
+ defaultRequestTimeout = 10 * time.Second
+ createContainerTimeout = 120 * time.Second
errorMissingProxy = errors.New("Missing proxy pointer")
errorMissingOCISpec = errors.New("Missing OCI specification")
defaultKataHostSharedDir = "/run/kata-containers/shared/sandboxes/"
@@ -987,17 +988,21 @@ func (k *kataAgent) stopSandbox(sandbox *Sandbox) error {
k.state.URL = ""
}()
- req := &grpc.DestroySandboxRequest{}
+ // If sandbox.state.State is unhealthy, we don't need to send DestroySandboxRequest
+ // to kata-agent, just force stop the sandbox
+ if sandbox.state.State != types.StateUnhealthy {
+ req := &grpc.DestroySandboxRequest{}
- if _, err := k.sendReq(req); err != nil {
- return err
- }
-
- if k.dynamicTracing {
- _, err := k.sendReq(&grpc.StopTracingRequest{})
- if err != nil {
+ if _, err := k.sendReq(req); err != nil {
return err
}
+
+ if k.dynamicTracing {
+ _, err := k.sendReq(&grpc.StopTracingRequest{})
+ if err != nil {
+ return err
+ }
+ }
}
return nil
@@ -2062,6 +2067,8 @@ func (k *kataAgent) getReqContext(reqName string) (ctx context.Context, cancel c
// Wait has no timeout
case grpcCheckRequest:
ctx, cancel = context.WithTimeout(ctx, checkRequestTimeout)
+ case grpcCreateContainerRequest:
+ ctx, cancel = context.WithTimeout(ctx, createContainerTimeout)
default:
ctx, cancel = context.WithTimeout(ctx, defaultRequestTimeout)
}
@@ -2382,3 +2389,7 @@ func (k *kataAgent) load(s persistapi.AgentState) {
k.state.ProxyPid = s.ProxyPid
k.state.URL = s.URL
}
+
+func (k *kataAgent) getProxyPid() int {
+ return k.state.ProxyPid
+}
diff --git a/virtcontainers/mock_hypervisor.go b/virtcontainers/mock_hypervisor.go
index 0c84e43c..a5b67491 100644
--- a/virtcontainers/mock_hypervisor.go
+++ b/virtcontainers/mock_hypervisor.go
@@ -39,7 +39,7 @@ func (m *mockHypervisor) startSandbox(timeout int) error {
return nil
}
-func (m *mockHypervisor) stopSandbox() error {
+func (m *mockHypervisor) stopSandbox(force bool) error {
return nil
}
diff --git a/virtcontainers/mock_hypervisor_test.go b/virtcontainers/mock_hypervisor_test.go
index b73b28f2..827e3192 100644
--- a/virtcontainers/mock_hypervisor_test.go
+++ b/virtcontainers/mock_hypervisor_test.go
@@ -53,7 +53,7 @@ func TestMockHypervisorStartSandbox(t *testing.T) {
func TestMockHypervisorStopSandbox(t *testing.T) {
var m *mockHypervisor
- assert.NoError(t, m.stopSandbox())
+ assert.NoError(t, m.stopSandbox(false))
}
func TestMockHypervisorAddDevice(t *testing.T) {
diff --git a/virtcontainers/noop_agent.go b/virtcontainers/noop_agent.go
index 189f6b3f..8a7cd337 100644
--- a/virtcontainers/noop_agent.go
+++ b/virtcontainers/noop_agent.go
@@ -236,3 +236,7 @@ func (n *noopAgent) save() (s persistapi.AgentState) {
// load is the Noop agent state loader. It does nothing.
func (n *noopAgent) load(s persistapi.AgentState) {}
+
+func (n *noopAgent) getProxyPid() int {
+ return -1
+}
\ No newline at end of file
diff --git a/virtcontainers/pkg/oci/utils.go b/virtcontainers/pkg/oci/utils.go
index 5348c57d..cd8d48ce 100644
--- a/virtcontainers/pkg/oci/utils.go
+++ b/virtcontainers/pkg/oci/utils.go
@@ -70,6 +70,9 @@ const (
// StatePaused represents a container that has been paused.
StatePaused = "paused"
+
+ // StateUnhealthy represents a container that is unhealthy
+ StateUnhealthy = "unhealthy"
)
const KernelModulesSeparator = ";"
@@ -964,6 +967,8 @@ func StateToOCIState(state types.StateString) string {
return StateStopped
case types.StatePaused:
return StatePaused
+ case types.StateUnhealthy:
+ return StateUnhealthy
default:
return ""
}
diff --git a/virtcontainers/qemu.go b/virtcontainers/qemu.go
index ca286550..4b15d968 100644
--- a/virtcontainers/qemu.go
+++ b/virtcontainers/qemu.go
@@ -687,7 +687,7 @@ func (q *qemu) setupVirtiofsd() (err error) {
q.Logger().Info("virtiofsd quits")
// Wait to release resources of virtiofsd process
cmd.Process.Wait()
- q.stopSandbox()
+ q.stopSandbox(false)
}()
return err
}
@@ -922,11 +922,11 @@ func (q *qemu) waitSandbox(timeout int) error {
}
// stopSandbox will stop the Sandbox's VM.
-func (q *qemu) stopSandbox() error {
+func (q *qemu) stopSandbox(force bool) error {
span, _ := q.trace("stopSandbox")
defer span.Finish()
- q.Logger().Info("Stopping Sandbox")
+ q.Logger().Infof("force stopping Sandbox: %v", force)
if q.stopped {
q.Logger().Info("Already stopped")
return nil
@@ -937,28 +937,37 @@ func (q *qemu) stopSandbox() error {
q.stopped = true
}()
- if q.config.Debug && q.qemuConfig.LogFile != "" {
- f, err := os.OpenFile(q.qemuConfig.LogFile, os.O_RDONLY, 0)
- if err == nil {
- scanner := bufio.NewScanner(f)
- for scanner.Scan() {
- q.Logger().Debug(scanner.Text())
- }
- if err := scanner.Err(); err != nil {
- q.Logger().WithError(err).Debug("read qemu log failed")
+ if !force {
+ if q.config.Debug && q.qemuConfig.LogFile != "" {
+ f, err := os.OpenFile(q.qemuConfig.LogFile, os.O_RDONLY, 0)
+ if err == nil {
+ scanner := bufio.NewScanner(f)
+ for scanner.Scan() {
+ q.Logger().Debug(scanner.Text())
+ }
+ if err := scanner.Err(); err != nil {
+ q.Logger().WithError(err).Debug("read qemu log failed")
+ }
}
}
- }
- err := q.qmpSetup()
- if err != nil {
- return err
- }
+ err := q.qmpSetup()
+ if err != nil {
+ return err
+ }
- err = q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx)
- if err != nil {
- q.Logger().WithError(err).Error("Fail to execute qmp QUIT")
- return err
+ err = q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx)
+ if err != nil {
+ q.Logger().WithError(err).Error("Fail to execute qmp QUIT")
+ return err
+ }
+ } else {
+ qemuMainPid := q.getPids()[0]
+ if qemuMainPid <= 1 {
+ return fmt.Errorf("force kill qemu process pid is invalid")
+ }
+
+ _ = syscall.Kill(qemuMainPid, syscall.SIGKILL)
}
return nil
diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go
index edd1af5b..78188ed7 100644
--- a/virtcontainers/sandbox.go
+++ b/virtcontainers/sandbox.go
@@ -12,19 +12,13 @@ import (
"math"
"net"
"os"
+ "path/filepath"
"strings"
"sync"
"syscall"
"github.com/containerd/cgroups"
"github.com/containernetworking/plugins/pkg/ns"
- "github.com/opencontainers/runc/libcontainer/configs"
- specs "github.com/opencontainers/runtime-spec/specs-go"
- opentracing "github.com/opentracing/opentracing-go"
- "github.com/pkg/errors"
- "github.com/sirupsen/logrus"
- "github.com/vishvananda/netlink"
-
"github.com/kata-containers/agent/protocols/grpc"
"github.com/kata-containers/runtime/virtcontainers/device/api"
"github.com/kata-containers/runtime/virtcontainers/device/config"
@@ -41,6 +35,12 @@ import (
"github.com/kata-containers/runtime/virtcontainers/store"
"github.com/kata-containers/runtime/virtcontainers/types"
"github.com/kata-containers/runtime/virtcontainers/utils"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ opentracing "github.com/opentracing/opentracing-go"
+ "github.com/pkg/errors"
+ "github.com/sirupsen/logrus"
+ "github.com/vishvananda/netlink"
)
const (
@@ -50,6 +50,9 @@ const (
// DirMode is the permission bits used for creating a directory
DirMode = os.FileMode(0750) | os.ModeDir
+
+ // KataProxyProcessName is the kata-proxy process name
+ KataProxyProcessName = "kata-proxy"
)
// SandboxStatus describes a sandbox status.
@@ -1037,7 +1040,7 @@ func (s *Sandbox) startVM() (err error) {
defer func() {
if err != nil {
- s.hypervisor.stopSandbox()
+ s.hypervisor.stopSandbox(false)
}
}()
@@ -1090,7 +1093,12 @@ func (s *Sandbox) stopVM() error {
}
s.Logger().Info("Stopping VM")
- return s.hypervisor.stopSandbox()
+ forceStop := false
+ if s.state.State == types.StateUnhealthy {
+ forceStop = true
+ }
+
+ return s.hypervisor.stopSandbox(forceStop)
}
func (s *Sandbox) addContainer(c *Container) error {
@@ -1591,13 +1599,15 @@ func (s *Sandbox) setSandboxState(state types.StateString) error {
return vcTypes.ErrNeedState
}
+ s.Logger().Debugf("Setting sandbox state from %v to %v", s.state.State, state)
// update in-memory state
s.state.State = state
if useOldStore(s.ctx) {
return s.store.Store(store.State, s.state)
+ } else {
+ return s.Save()
}
- return nil
}
const maxBlockIndex = 65535
@@ -2207,3 +2217,79 @@ func (s *Sandbox) GetPatchedOCISpec() *specs.Spec {
return nil
}
+
+// health reports whether the sandbox is currently healthy.
+// It returns false when s.agent.check() returns an error
+// (presumably when the qemu/kata-proxy/kata-agent process is abnormal).
+func (s *Sandbox) health() bool {
+ err := s.agent.check()
+ if err != nil {
+ return false
+ }
+
+ return true
+}
+
+// shouldForceDelete reports whether neither the kata-proxy nor the hypervisor process is still running
+// and the current process cmdline contains both "delete" and "force" (presumably a forced kata-runtime delete).
+func (s *Sandbox) shouldForceDelete() bool {
+ cmdline, err := utils.GetProcessCmdline(os.Getpid())
+ if err != nil {
+ s.Logger().Errorf("fail to get process cmdline: %v", err)
+ return false
+ }
+
+ proxyPid := s.agent.getProxyPid()
+ hypervisorPids := s.hypervisor.getPids()
+ if len(hypervisorPids) <= 0 {
+ s.Logger().Warnf("get hypervisor main pid fail")
+ return false
+ }
+ hypervisorMainPid := hypervisorPids[0]
+ hypervisorPath := s.hypervisor.hypervisorConfig().HypervisorPath
+ hypervisorName := filepath.Base(hypervisorPath)
+
+ if !utils.IsProcessRunning(proxyPid, KataProxyProcessName, s.id) && !utils.IsProcessRunning(hypervisorMainPid, hypervisorName, s.id) &&
+ strings.Contains(cmdline, "delete") && strings.Contains(cmdline, "force") {
+ return true
+ }
+
+ return false
+}
+
+func (s *Sandbox) forceDeleteSandbox() {
+ for _, c := range s.containers {
+ // force delete all containers in the sandbox
+ c.forceDeleteContainer()
+ }
+
+ globalSandboxList.removeSandbox(s.id)
+
+ if s.monitor != nil {
+ s.monitor.stop()
+ }
+
+ if err := s.hypervisor.cleanup(); err != nil {
+ s.Logger().WithError(err).Error("failed to force cleanup hypervisor resource")
+ }
+
+ s.agent.cleanup(s)
+
+ if err := s.store.Delete(); err != nil {
+ s.Logger().WithError(err).Warn("sandbox force delete store failed")
+ }
+}
+
+func (s *Sandbox) setContainersState(state types.StateString) error {
+ if state == "" {
+ return vcTypes.ErrNeedState
+ }
+
+ for _, c := range s.containers {
+ if err := c.setContainerState(state); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
diff --git a/virtcontainers/types/sandbox.go b/virtcontainers/types/sandbox.go
index 3b64b20a..5d586b21 100644
--- a/virtcontainers/types/sandbox.go
+++ b/virtcontainers/types/sandbox.go
@@ -28,6 +28,9 @@ const (
// StateStopped represents a sandbox/container that has been stopped.
StateStopped StateString = "stopped"
+
+ // StateUnhealthy represents a sandbox/container that's in abnormal state.
+ StateUnhealthy StateString = "unhealthy"
)
const (
@@ -90,17 +93,17 @@ func (state *StateString) validTransition(oldState StateString, newState StateSt
switch *state {
case StateReady:
- if newState == StateRunning || newState == StateStopped {
+ if newState == StateRunning || newState == StateStopped || newState == StateUnhealthy {
return nil
}
case StateRunning:
- if newState == StatePaused || newState == StateStopped {
+ if newState == StatePaused || newState == StateStopped || newState == StateUnhealthy {
return nil
}
case StatePaused:
- if newState == StateRunning || newState == StateStopped {
+ if newState == StateRunning || newState == StateStopped || newState == StateUnhealthy {
return nil
}
@@ -108,6 +111,11 @@ func (state *StateString) validTransition(oldState StateString, newState StateSt
if newState == StateRunning {
return nil
}
+
+ case StateUnhealthy:
+ if newState == StateStopped {
+ return nil
+ }
}
return fmt.Errorf("Can not move from %v to %v",
diff --git a/virtcontainers/utils/utils.go b/virtcontainers/utils/utils.go
index 85c55489..2b555ebb 100644
--- a/virtcontainers/utils/utils.go
+++ b/virtcontainers/utils/utils.go
@@ -9,9 +9,13 @@ import (
"crypto/rand"
"errors"
"fmt"
+ "io/ioutil"
"os"
"os/exec"
"path/filepath"
+ "strconv"
+ "strings"
+ "syscall"
)
const cpBinaryName = "cp"
@@ -275,3 +279,45 @@ const (
MiB = KiB << 10
GiB = MiB << 10
)
+
+// GetProcessCmdline returns the raw content of /proc/<pid>/cmdline; pids <= 1 are rejected as invalid.
+func GetProcessCmdline(pid int) (cmdline string, err error) {
+ if pid <= 1 {
+ return "", fmt.Errorf("invalid pid number")
+ }
+
+ bytes, err := ioutil.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "cmdline"))
+ if err != nil {
+ return "", err
+ }
+
+ return string(bytes), nil
+}
+
+func IsProcessRunning(pid int, processName string, sandboxID string) bool {
+ if pid <= 0 {
+ return false
+ }
+
+ process, err := os.FindProcess(pid)
+ if err != nil {
+ return false
+ }
+
+ if err := process.Signal(syscall.Signal(0)); err != nil {
+ return false
+ }
+
+ cmdline, err := GetProcessCmdline(pid)
+ if err != nil {
+ return false
+ }
+
+ // If the process's cmdline contains both processName and sandboxID,
+ // we assume the pid has not been reused by an unrelated process
+ if strings.Contains(cmdline, processName) && strings.Contains(cmdline, sandboxID) {
+ return true
+ }
+
+ return false
+}
diff --git a/virtcontainers/vm.go b/virtcontainers/vm.go
index fcda1e97..8d27b1fe 100644
--- a/virtcontainers/vm.go
+++ b/virtcontainers/vm.go
@@ -191,7 +191,7 @@ func NewVM(ctx context.Context, config VMConfig) (*VM, error) {
defer func() {
if err != nil {
virtLog.WithField("vm", id).WithError(err).Info("clean up vm")
- hypervisor.stopSandbox()
+ hypervisor.stopSandbox(false)
}
}()
@@ -333,7 +333,7 @@ func (v *VM) Disconnect() error {
func (v *VM) Stop() error {
v.logger().Info("stop vm")
- if err := v.hypervisor.stopSandbox(); err != nil {
+ if err := v.hypervisor.stopSandbox(false); err != nil {
return err
}
--
2.14.3 (Apple Git-98)