Fix #I4KI81 reason: modify kata-containers version and update it to 1.11.1 Signed-off-by: holyfei <yangfeiyu20092010@163.com>
856 lines
27 KiB
Diff
856 lines
27 KiB
Diff
From d93da1875ed7f1a6061cffb13475506d73c86003 Mon Sep 17 00:00:00 2001
|
|
From: jiangpengfei <jiangpengfei9@huawei.com>
|
|
Date: Sat, 25 Jul 2020 16:04:19 +0800
|
|
Subject: [PATCH 07/50] kata-runtime: enhance reliability when kata related
|
|
process
|
|
|
|
reason: enhance the reliability when kata related processes are abnormal,
|
|
make kata-container still destroy the sandbox and clean up all resources.
|
|
|
|
Signed-off-by: jiangpengfei <jiangpengfei9@huawei.com>
|
|
---
|
|
cli/delete.go | 6 ++
|
|
cli/kill.go | 3 +-
|
|
virtcontainers/acrn.go | 2 +-
|
|
virtcontainers/agent.go | 3 +
|
|
virtcontainers/api.go | 67 +++++++++++++++++++++
|
|
virtcontainers/clh.go | 2 +-
|
|
virtcontainers/container.go | 55 +++++++++++++++--
|
|
virtcontainers/fc.go | 2 +-
|
|
virtcontainers/hypervisor.go | 2 +-
|
|
virtcontainers/kata_agent.go | 31 ++++++----
|
|
virtcontainers/mock_hypervisor.go | 2 +-
|
|
virtcontainers/mock_hypervisor_test.go | 2 +-
|
|
virtcontainers/noop_agent.go | 4 ++
|
|
virtcontainers/pkg/oci/utils.go | 5 ++
|
|
virtcontainers/qemu.go | 51 +++++++++-------
|
|
virtcontainers/sandbox.go | 106 +++++++++++++++++++++++++++++----
|
|
virtcontainers/types/sandbox.go | 14 ++++-
|
|
virtcontainers/utils/utils.go | 46 ++++++++++++++
|
|
virtcontainers/vm.go | 4 +-
|
|
19 files changed, 348 insertions(+), 59 deletions(-)
|
|
|
|
diff --git a/cli/delete.go b/cli/delete.go
|
|
index c2ce52a4..2f5586e5 100644
|
|
--- a/cli/delete.go
|
|
+++ b/cli/delete.go
|
|
@@ -110,6 +110,12 @@ func delete(ctx context.Context, containerID string, force bool) error {
|
|
forceStop = true
|
|
}
|
|
|
|
+ if oci.StateToOCIState(status.State.State) == oci.StateUnhealthy {
|
|
+ // Set forceStop and force bool flag to true to force delete everything
|
|
+ forceStop = true
|
|
+ force = true
|
|
+ }
|
|
+
|
|
switch containerType {
|
|
case vc.PodSandbox:
|
|
if err := deleteSandbox(ctx, sandboxID, force); err != nil {
|
|
diff --git a/cli/kill.go b/cli/kill.go
|
|
index 60fa41e0..b228205f 100644
|
|
--- a/cli/kill.go
|
|
+++ b/cli/kill.go
|
|
@@ -133,11 +133,12 @@ func kill(ctx context.Context, containerID, signal string, all bool) error {
|
|
kataLog.WithField("signal", signal).WithField("container state", status.State.State).Info("kill")
|
|
|
|
// container MUST be created, running or paused
|
|
+ // If container state is unhealthy, should process this exceptional case separately
|
|
if status.State.State == types.StateReady || status.State.State == types.StateRunning || status.State.State == types.StatePaused {
|
|
if err := vci.KillContainer(ctx, sandboxID, containerID, signum, all); err != nil {
|
|
return err
|
|
}
|
|
- } else if !all {
|
|
+ } else if !all && status.State.State != types.StateUnhealthy {
|
|
return fmt.Errorf("container not running")
|
|
}
|
|
|
|
diff --git a/virtcontainers/acrn.go b/virtcontainers/acrn.go
|
|
index 761eda03..10cae06f 100644
|
|
--- a/virtcontainers/acrn.go
|
|
+++ b/virtcontainers/acrn.go
|
|
@@ -475,7 +475,7 @@ func (a *Acrn) waitSandbox(timeoutSecs int) error {
|
|
}
|
|
|
|
// stopSandbox will stop the Sandbox's VM.
|
|
-func (a *Acrn) stopSandbox() (err error) {
|
|
+func (a *Acrn) stopSandbox(force bool) (err error) {
|
|
span, _ := a.trace("stopSandbox")
|
|
defer span.Finish()
|
|
|
|
diff --git a/virtcontainers/agent.go b/virtcontainers/agent.go
|
|
index c62107ec..be9526c7 100644
|
|
--- a/virtcontainers/agent.go
|
|
+++ b/virtcontainers/agent.go
|
|
@@ -259,4 +259,7 @@ type agent interface {
|
|
|
|
// load data from disk
|
|
load(persistapi.AgentState)
|
|
+
|
|
+ // get proxy process pid
|
|
+ getProxyPid() int
|
|
}
|
|
diff --git a/virtcontainers/api.go b/virtcontainers/api.go
|
|
index de569713..fa82d163 100644
|
|
--- a/virtcontainers/api.go
|
|
+++ b/virtcontainers/api.go
|
|
@@ -7,8 +7,10 @@ package virtcontainers
|
|
|
|
import (
|
|
"context"
|
|
+ "fmt"
|
|
"os"
|
|
"runtime"
|
|
+ "strings"
|
|
"syscall"
|
|
|
|
deviceApi "github.com/kata-containers/runtime/virtcontainers/device/api"
|
|
@@ -18,6 +20,7 @@ import (
|
|
vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types"
|
|
"github.com/kata-containers/runtime/virtcontainers/store"
|
|
"github.com/kata-containers/runtime/virtcontainers/types"
|
|
+ "github.com/kata-containers/runtime/virtcontainers/utils"
|
|
specs "github.com/opencontainers/runtime-spec/specs-go"
|
|
opentracing "github.com/opentracing/opentracing-go"
|
|
"github.com/sirupsen/logrus"
|
|
@@ -597,20 +600,51 @@ func statusContainer(sandbox *Sandbox, containerID string) (ContainerStatus, err
|
|
container.state.State == types.StatePaused) &&
|
|
container.process.Pid > 0 {
|
|
|
|
+ // If container state is active but the kata-proxy and qemu processes have already exited,
|
|
+ // the sandbox has been stopped exceptionally, so we should force delete the
|
|
+ // sandbox and container state files in sandbox.Store
|
|
+ if sandbox.shouldForceDelete() {
|
|
+ virtLog.Logger.Warn("sandbox status is abnormal, sandbox should be force deleted")
|
|
+ sandbox.forceDeleteSandbox()
|
|
+ return ContainerStatus{}, fmt.Errorf("sandbox has beed stopped exceptionally")
|
|
+ }
|
|
+
|
|
running, err := isShimRunning(container.process.Pid)
|
|
if err != nil {
|
|
return ContainerStatus{}, err
|
|
}
|
|
|
|
+ // If the kata-shim process has exited or been killed, the container needs to be stopped
|
|
if !running {
|
|
virtLog.WithFields(logrus.Fields{
|
|
"state": container.state.State,
|
|
"pid": container.process.Pid}).
|
|
Info("container isn't running")
|
|
+
|
|
if err := container.stop(true); err != nil {
|
|
return ContainerStatus{}, err
|
|
}
|
|
}
|
|
+
|
|
+ isPodSandbox := (containerID == sandbox.id)
|
|
+
|
|
+ // If sandbox is unhealthy, process it correctly
|
|
+ if !sandbox.health() {
|
|
+ // process podSandbox container type case
|
|
+ if isPodSandbox {
|
|
+ if err := processUnhealthySandbox(sandbox, container); err != nil {
|
|
+ return ContainerStatus{}, err
|
|
+ }
|
|
+ } else {
|
|
+ // If container type is pod_container, which means container operations can not be
|
|
+ // processed successfully, we should return the error as soon as possible
|
|
+ if err := container.setContainerState(types.StateUnhealthy); err != nil {
|
|
+ return ContainerStatus{}, err
|
|
+ }
|
|
+
|
|
+ return ContainerStatus{}, fmt.Errorf("container status is unhealthy, stop container failed")
|
|
+ }
|
|
+ }
|
|
}
|
|
|
|
return ContainerStatus{
|
|
@@ -1016,3 +1050,36 @@ func CleanupContainer(ctx context.Context, sandboxID, containerID string, force
|
|
|
|
return nil
|
|
}
|
|
+
|
|
+// processUnhealthySandbox sets the sandbox and all of its containers to the unhealthy state and,
|
|
+// when the caller is kata-runtime kill or kata-runtime delete, force stops the container's kata-shim
|
|
+func processUnhealthySandbox(sandbox *Sandbox, container *Container) error {
|
|
+ // Set all containers state to unhealthy
|
|
+ if err := sandbox.setContainersState(types.StateUnhealthy); err != nil {
|
|
+ container.Logger().WithError(err).Warn("set all containers state to unhealthy fail")
|
|
+ }
|
|
+
|
|
+ // Set sandbox state to unhealthy
|
|
+ if err := sandbox.setSandboxState(types.StateUnhealthy); err != nil {
|
|
+ container.Logger().WithError(err).Warn("set sandbox state to unhealthy fail")
|
|
+ }
|
|
+
|
|
+ forceDelete := false
|
|
+
|
|
+ // If process is kata-runtime kill or kata-runtime delete,
|
|
+ // we should kill or delete sandbox forcefully
|
|
+ if cmdline, err := utils.GetProcessCmdline(os.Getpid()); err != nil {
|
|
+ container.Logger().WithError(err).Warn("fail to get process cmdline info")
|
|
+ } else {
|
|
+ forceDelete = strings.Contains(cmdline, "kill") || strings.Contains(cmdline, "delete")
|
|
+ }
|
|
+
|
|
+ if forceDelete {
|
|
+ // force stop podSandbox type container's kata-shim process
|
|
+ if err := stopShim(container.process.Pid); err != nil {
|
|
+ container.Logger().WithError(err).Warn("fail to stop podSandbox type container kata-shim")
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return nil
|
|
+}
|
|
diff --git a/virtcontainers/clh.go b/virtcontainers/clh.go
|
|
index d40b698b..59510b02 100644
|
|
--- a/virtcontainers/clh.go
|
|
+++ b/virtcontainers/clh.go
|
|
@@ -569,7 +569,7 @@ func (clh *cloudHypervisor) resumeSandbox() error {
|
|
}
|
|
|
|
// stopSandbox will stop the Sandbox's VM.
|
|
-func (clh *cloudHypervisor) stopSandbox() (err error) {
|
|
+func (clh *cloudHypervisor) stopSandbox(force bool) (err error) {
|
|
span, _ := clh.trace("stopSandbox")
|
|
defer span.Finish()
|
|
clh.Logger().WithField("function", "stopSandbox").Info("Stop Sandbox")
|
|
diff --git a/virtcontainers/container.go b/virtcontainers/container.go
|
|
index b42cc6e9..9485e708 100644
|
|
--- a/virtcontainers/container.go
|
|
+++ b/virtcontainers/container.go
|
|
@@ -17,8 +17,12 @@ import (
|
|
"time"
|
|
|
|
"github.com/containerd/cgroups"
|
|
+ "github.com/kata-containers/runtime/virtcontainers/device/config"
|
|
+ "github.com/kata-containers/runtime/virtcontainers/device/manager"
|
|
vccgroups "github.com/kata-containers/runtime/virtcontainers/pkg/cgroups"
|
|
+ "github.com/kata-containers/runtime/virtcontainers/pkg/rootless"
|
|
vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types"
|
|
+ "github.com/kata-containers/runtime/virtcontainers/store"
|
|
"github.com/kata-containers/runtime/virtcontainers/types"
|
|
"github.com/kata-containers/runtime/virtcontainers/utils"
|
|
specs "github.com/opencontainers/runtime-spec/specs-go"
|
|
@@ -26,11 +30,6 @@ import (
|
|
"github.com/pkg/errors"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
-
|
|
- "github.com/kata-containers/runtime/virtcontainers/device/config"
|
|
- "github.com/kata-containers/runtime/virtcontainers/device/manager"
|
|
- "github.com/kata-containers/runtime/virtcontainers/pkg/rootless"
|
|
- "github.com/kata-containers/runtime/virtcontainers/store"
|
|
)
|
|
|
|
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h
|
|
@@ -1047,6 +1046,13 @@ func (c *Container) stop(force bool) error {
|
|
return nil
|
|
}
|
|
|
|
+ // If container state is unhealthy, just force kill the container
|
|
+ if c.state.State == types.StateUnhealthy {
|
|
+ c.forceKillContainer()
|
|
+ // after force kill container, then change container state to stopped
|
|
+ return c.setContainerState(types.StateStopped)
|
|
+ }
|
|
+
|
|
if err := c.state.ValidTransition(c.state.State, types.StateStopped); err != nil {
|
|
return err
|
|
}
|
|
@@ -1063,6 +1069,8 @@ func (c *Container) stop(force bool) error {
|
|
if err := stopShim(c.process.Pid); err != nil {
|
|
l.WithError(err).Warn("failed to stop shim")
|
|
}
|
|
+
|
|
+ c.forceKillContainer()
|
|
}
|
|
|
|
}()
|
|
@@ -1096,7 +1104,9 @@ func (c *Container) stop(force bool) error {
|
|
// this signal will ensure the container will get killed to match
|
|
// the state of the shim. This will allow the following call to
|
|
// stopContainer() to succeed in such particular case.
|
|
- c.kill(syscall.SIGKILL, true)
|
|
+ if err := c.kill(syscall.SIGKILL, true); err != nil {
|
|
+ c.Logger().Errorf("send signal to container failed: %v", err)
|
|
+ }
|
|
|
|
// Since the agent has supported the MultiWaitProcess, it's better to
|
|
// wait the process here to make sure the process has exited before to
|
|
@@ -1582,3 +1592,36 @@ func (c *Container) cgroupsUpdate(resources specs.LinuxResources) error {
|
|
|
|
return nil
|
|
}
|
|
+
|
|
+// forceDeleteContainer force cleans the container's mount info and its resources stored on disk
|
|
+func (c *Container) forceDeleteContainer() {
|
|
+ if err := c.unmountHostMounts(); err != nil {
|
|
+ c.Logger().WithError(err).Warn("container force delete umount host mounts fail")
|
|
+ }
|
|
+
|
|
+ if err := c.sandbox.removeContainer(c.id); err != nil {
|
|
+ c.Logger().WithError(err).Warn("sandbox removeContainer fail")
|
|
+ }
|
|
+
|
|
+ if err := c.store.Delete(); err != nil {
|
|
+ c.Logger().WithError(err).Warn("force delete container store fail")
|
|
+ }
|
|
+}
|
|
+
|
|
+func (c *Container) forceKillContainer() {
|
|
+ if err := c.setContainerState(types.StateStopped); err != nil {
|
|
+ c.Logger().WithError(err).Warn("force kill container: change container state to StateStopped failed")
|
|
+ }
|
|
+
|
|
+ if err := c.unmountHostMounts(); err != nil {
|
|
+ c.Logger().WithError(err).Warn("force kill container: umount container host mounts failed")
|
|
+ }
|
|
+
|
|
+ if err := c.detachDevices(); err != nil {
|
|
+ c.Logger().WithError(err).Warn("force kill container: detach container devices failed")
|
|
+ }
|
|
+
|
|
+ if err := c.removeDrive(); err != nil {
|
|
+ c.Logger().WithError(err).Warn("force kill container: remove container drive failed")
|
|
+ }
|
|
+}
|
|
diff --git a/virtcontainers/fc.go b/virtcontainers/fc.go
|
|
index 97ef5ffc..72a8e192 100644
|
|
--- a/virtcontainers/fc.go
|
|
+++ b/virtcontainers/fc.go
|
|
@@ -864,7 +864,7 @@ func (fc *firecracker) cleanupJail() {
|
|
}
|
|
|
|
// stopSandbox will stop the Sandbox's VM.
|
|
-func (fc *firecracker) stopSandbox() (err error) {
|
|
+func (fc *firecracker) stopSandbox(force bool) (err error) {
|
|
span, _ := fc.trace("stopSandbox")
|
|
defer span.Finish()
|
|
|
|
diff --git a/virtcontainers/hypervisor.go b/virtcontainers/hypervisor.go
|
|
index 4b3dd3d0..fd7d1f8e 100644
|
|
--- a/virtcontainers/hypervisor.go
|
|
+++ b/virtcontainers/hypervisor.go
|
|
@@ -766,7 +766,7 @@ func generateVMSocket(id string, useVsock bool, vmStogarePath string) (interface
|
|
type hypervisor interface {
|
|
createSandbox(ctx context.Context, id string, networkNS NetworkNamespace, hypervisorConfig *HypervisorConfig, stateful bool) error
|
|
startSandbox(timeout int) error
|
|
- stopSandbox() error
|
|
+ stopSandbox(force bool) error
|
|
pauseSandbox() error
|
|
saveSandbox() error
|
|
resumeSandbox() error
|
|
diff --git a/virtcontainers/kata_agent.go b/virtcontainers/kata_agent.go
|
|
index be5e96aa..7575d326 100644
|
|
--- a/virtcontainers/kata_agent.go
|
|
+++ b/virtcontainers/kata_agent.go
|
|
@@ -57,8 +57,9 @@ const (
|
|
)
|
|
|
|
var (
|
|
- checkRequestTimeout = 30 * time.Second
|
|
- defaultRequestTimeout = 60 * time.Second
|
|
+ checkRequestTimeout = 10 * time.Second
|
|
+ defaultRequestTimeout = 10 * time.Second
|
|
+ createContainerTimeout = 120 * time.Second
|
|
errorMissingProxy = errors.New("Missing proxy pointer")
|
|
errorMissingOCISpec = errors.New("Missing OCI specification")
|
|
defaultKataHostSharedDir = "/run/kata-containers/shared/sandboxes/"
|
|
@@ -987,17 +988,21 @@ func (k *kataAgent) stopSandbox(sandbox *Sandbox) error {
|
|
k.state.URL = ""
|
|
}()
|
|
|
|
- req := &grpc.DestroySandboxRequest{}
|
|
+ // If sandbox.state.State is unhealthy, we don't need to send DestroySandboxRequest
|
|
+ // to kata-agent, just force stop the sandbox
|
|
+ if sandbox.state.State != types.StateUnhealthy {
|
|
+ req := &grpc.DestroySandboxRequest{}
|
|
|
|
- if _, err := k.sendReq(req); err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- if k.dynamicTracing {
|
|
- _, err := k.sendReq(&grpc.StopTracingRequest{})
|
|
- if err != nil {
|
|
+ if _, err := k.sendReq(req); err != nil {
|
|
return err
|
|
}
|
|
+
|
|
+ if k.dynamicTracing {
|
|
+ _, err := k.sendReq(&grpc.StopTracingRequest{})
|
|
+ if err != nil {
|
|
+ return err
|
|
+ }
|
|
+ }
|
|
}
|
|
|
|
return nil
|
|
@@ -2062,6 +2067,8 @@ func (k *kataAgent) getReqContext(reqName string) (ctx context.Context, cancel c
|
|
// Wait has no timeout
|
|
case grpcCheckRequest:
|
|
ctx, cancel = context.WithTimeout(ctx, checkRequestTimeout)
|
|
+ case grpcCreateContainerRequest:
|
|
+ ctx, cancel = context.WithTimeout(ctx, createContainerTimeout)
|
|
default:
|
|
ctx, cancel = context.WithTimeout(ctx, defaultRequestTimeout)
|
|
}
|
|
@@ -2382,3 +2389,7 @@ func (k *kataAgent) load(s persistapi.AgentState) {
|
|
k.state.ProxyPid = s.ProxyPid
|
|
k.state.URL = s.URL
|
|
}
|
|
+
|
|
+func (k *kataAgent) getProxyPid() int {
|
|
+ return k.state.ProxyPid
|
|
+}
|
|
diff --git a/virtcontainers/mock_hypervisor.go b/virtcontainers/mock_hypervisor.go
|
|
index 0c84e43c..a5b67491 100644
|
|
--- a/virtcontainers/mock_hypervisor.go
|
|
+++ b/virtcontainers/mock_hypervisor.go
|
|
@@ -39,7 +39,7 @@ func (m *mockHypervisor) startSandbox(timeout int) error {
|
|
return nil
|
|
}
|
|
|
|
-func (m *mockHypervisor) stopSandbox() error {
|
|
+func (m *mockHypervisor) stopSandbox(force bool) error {
|
|
return nil
|
|
}
|
|
|
|
diff --git a/virtcontainers/mock_hypervisor_test.go b/virtcontainers/mock_hypervisor_test.go
|
|
index b73b28f2..827e3192 100644
|
|
--- a/virtcontainers/mock_hypervisor_test.go
|
|
+++ b/virtcontainers/mock_hypervisor_test.go
|
|
@@ -53,7 +53,7 @@ func TestMockHypervisorStartSandbox(t *testing.T) {
|
|
func TestMockHypervisorStopSandbox(t *testing.T) {
|
|
var m *mockHypervisor
|
|
|
|
- assert.NoError(t, m.stopSandbox())
|
|
+ assert.NoError(t, m.stopSandbox(false))
|
|
}
|
|
|
|
func TestMockHypervisorAddDevice(t *testing.T) {
|
|
diff --git a/virtcontainers/noop_agent.go b/virtcontainers/noop_agent.go
|
|
index 189f6b3f..8a7cd337 100644
|
|
--- a/virtcontainers/noop_agent.go
|
|
+++ b/virtcontainers/noop_agent.go
|
|
@@ -236,3 +236,7 @@ func (n *noopAgent) save() (s persistapi.AgentState) {
|
|
|
|
// load is the Noop agent state loader. It does nothing.
|
|
func (n *noopAgent) load(s persistapi.AgentState) {}
|
|
+
|
|
+func (n *noopAgent) getProxyPid() int {
|
|
+ return -1
|
|
+}
|
|
\ No newline at end of file
|
|
diff --git a/virtcontainers/pkg/oci/utils.go b/virtcontainers/pkg/oci/utils.go
|
|
index 5348c57d..cd8d48ce 100644
|
|
--- a/virtcontainers/pkg/oci/utils.go
|
|
+++ b/virtcontainers/pkg/oci/utils.go
|
|
@@ -70,6 +70,9 @@ const (
|
|
|
|
// StatePaused represents a container that has been paused.
|
|
StatePaused = "paused"
|
|
+
|
|
+ // StateUnhealthy represents a container that is unhealthy
|
|
+ StateUnhealthy = "unhealthy"
|
|
)
|
|
|
|
const KernelModulesSeparator = ";"
|
|
@@ -964,6 +967,8 @@ func StateToOCIState(state types.StateString) string {
|
|
return StateStopped
|
|
case types.StatePaused:
|
|
return StatePaused
|
|
+ case types.StateUnhealthy:
|
|
+ return StateUnhealthy
|
|
default:
|
|
return ""
|
|
}
|
|
diff --git a/virtcontainers/qemu.go b/virtcontainers/qemu.go
|
|
index ca286550..4b15d968 100644
|
|
--- a/virtcontainers/qemu.go
|
|
+++ b/virtcontainers/qemu.go
|
|
@@ -687,7 +687,7 @@ func (q *qemu) setupVirtiofsd() (err error) {
|
|
q.Logger().Info("virtiofsd quits")
|
|
// Wait to release resources of virtiofsd process
|
|
cmd.Process.Wait()
|
|
- q.stopSandbox()
|
|
+ q.stopSandbox(false)
|
|
}()
|
|
return err
|
|
}
|
|
@@ -922,11 +922,11 @@ func (q *qemu) waitSandbox(timeout int) error {
|
|
}
|
|
|
|
// stopSandbox will stop the Sandbox's VM.
|
|
-func (q *qemu) stopSandbox() error {
|
|
+func (q *qemu) stopSandbox(force bool) error {
|
|
span, _ := q.trace("stopSandbox")
|
|
defer span.Finish()
|
|
|
|
- q.Logger().Info("Stopping Sandbox")
|
|
+ q.Logger().Infof("force stopping Sandbox: %v", force)
|
|
if q.stopped {
|
|
q.Logger().Info("Already stopped")
|
|
return nil
|
|
@@ -937,28 +937,37 @@ func (q *qemu) stopSandbox() error {
|
|
q.stopped = true
|
|
}()
|
|
|
|
- if q.config.Debug && q.qemuConfig.LogFile != "" {
|
|
- f, err := os.OpenFile(q.qemuConfig.LogFile, os.O_RDONLY, 0)
|
|
- if err == nil {
|
|
- scanner := bufio.NewScanner(f)
|
|
- for scanner.Scan() {
|
|
- q.Logger().Debug(scanner.Text())
|
|
- }
|
|
- if err := scanner.Err(); err != nil {
|
|
- q.Logger().WithError(err).Debug("read qemu log failed")
|
|
+ if !force {
|
|
+ if q.config.Debug && q.qemuConfig.LogFile != "" {
|
|
+ f, err := os.OpenFile(q.qemuConfig.LogFile, os.O_RDONLY, 0)
|
|
+ if err == nil {
|
|
+ scanner := bufio.NewScanner(f)
|
|
+ for scanner.Scan() {
|
|
+ q.Logger().Debug(scanner.Text())
|
|
+ }
|
|
+ if err := scanner.Err(); err != nil {
|
|
+ q.Logger().WithError(err).Debug("read qemu log failed")
|
|
+ }
|
|
}
|
|
}
|
|
- }
|
|
|
|
- err := q.qmpSetup()
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
+ err := q.qmpSetup()
|
|
+ if err != nil {
|
|
+ return err
|
|
+ }
|
|
|
|
- err = q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx)
|
|
- if err != nil {
|
|
- q.Logger().WithError(err).Error("Fail to execute qmp QUIT")
|
|
- return err
|
|
+ err = q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx)
|
|
+ if err != nil {
|
|
+ q.Logger().WithError(err).Error("Fail to execute qmp QUIT")
|
|
+ return err
|
|
+ }
|
|
+ } else {
|
|
+ qemuMainPid := q.getPids()[0]
|
|
+ if qemuMainPid <= 1 {
|
|
+ return fmt.Errorf("force kill qemu process pid is invalid")
|
|
+ }
|
|
+
|
|
+ _ = syscall.Kill(qemuMainPid, syscall.SIGKILL)
|
|
}
|
|
|
|
return nil
|
|
diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go
|
|
index edd1af5b..78188ed7 100644
|
|
--- a/virtcontainers/sandbox.go
|
|
+++ b/virtcontainers/sandbox.go
|
|
@@ -12,19 +12,13 @@ import (
|
|
"math"
|
|
"net"
|
|
"os"
|
|
+ "path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"syscall"
|
|
|
|
"github.com/containerd/cgroups"
|
|
"github.com/containernetworking/plugins/pkg/ns"
|
|
- "github.com/opencontainers/runc/libcontainer/configs"
|
|
- specs "github.com/opencontainers/runtime-spec/specs-go"
|
|
- opentracing "github.com/opentracing/opentracing-go"
|
|
- "github.com/pkg/errors"
|
|
- "github.com/sirupsen/logrus"
|
|
- "github.com/vishvananda/netlink"
|
|
-
|
|
"github.com/kata-containers/agent/protocols/grpc"
|
|
"github.com/kata-containers/runtime/virtcontainers/device/api"
|
|
"github.com/kata-containers/runtime/virtcontainers/device/config"
|
|
@@ -41,6 +35,12 @@ import (
|
|
"github.com/kata-containers/runtime/virtcontainers/store"
|
|
"github.com/kata-containers/runtime/virtcontainers/types"
|
|
"github.com/kata-containers/runtime/virtcontainers/utils"
|
|
+ "github.com/opencontainers/runc/libcontainer/configs"
|
|
+ specs "github.com/opencontainers/runtime-spec/specs-go"
|
|
+ opentracing "github.com/opentracing/opentracing-go"
|
|
+ "github.com/pkg/errors"
|
|
+ "github.com/sirupsen/logrus"
|
|
+ "github.com/vishvananda/netlink"
|
|
)
|
|
|
|
const (
|
|
@@ -50,6 +50,9 @@ const (
|
|
|
|
// DirMode is the permission bits used for creating a directory
|
|
DirMode = os.FileMode(0750) | os.ModeDir
|
|
+
|
|
+ // KataProxyProcessName is the kata-proxy process name
|
|
+ KataProxyProcessName = "kata-proxy"
|
|
)
|
|
|
|
// SandboxStatus describes a sandbox status.
|
|
@@ -1037,7 +1040,7 @@ func (s *Sandbox) startVM() (err error) {
|
|
|
|
defer func() {
|
|
if err != nil {
|
|
- s.hypervisor.stopSandbox()
|
|
+ s.hypervisor.stopSandbox(false)
|
|
}
|
|
}()
|
|
|
|
@@ -1090,7 +1093,12 @@ func (s *Sandbox) stopVM() error {
|
|
}
|
|
|
|
s.Logger().Info("Stopping VM")
|
|
- return s.hypervisor.stopSandbox()
|
|
+ forceStop := false
|
|
+ if s.state.State == types.StateUnhealthy {
|
|
+ forceStop = true
|
|
+ }
|
|
+
|
|
+ return s.hypervisor.stopSandbox(forceStop)
|
|
}
|
|
|
|
func (s *Sandbox) addContainer(c *Container) error {
|
|
@@ -1591,13 +1599,15 @@ func (s *Sandbox) setSandboxState(state types.StateString) error {
|
|
return vcTypes.ErrNeedState
|
|
}
|
|
|
|
+ s.Logger().Debugf("Setting sandbox state from %v to %v", s.state.State, state)
|
|
// update in-memory state
|
|
s.state.State = state
|
|
|
|
if useOldStore(s.ctx) {
|
|
return s.store.Store(store.State, s.state)
|
|
+ } else {
|
|
+ return s.Save()
|
|
}
|
|
- return nil
|
|
}
|
|
|
|
const maxBlockIndex = 65535
|
|
@@ -2207,3 +2217,79 @@ func (s *Sandbox) GetPatchedOCISpec() *specs.Spec {
|
|
|
|
return nil
|
|
}
|
|
+
|
|
+// health reports whether the sandbox is currently healthy.
|
|
+// If the qemu/kata-proxy/kata-agent process is abnormal, s.agent.check()
|
|
+// is expected to return an error, in which case health returns false
|
|
+func (s *Sandbox) health() bool {
|
|
+ err := s.agent.check()
|
|
+ if err != nil {
|
|
+ return false
|
|
+ }
|
|
+
|
|
+ return true
|
|
+}
|
|
+
|
|
+// shouldForceDelete reports whether the sandbox should be force deleted: the kata-proxy and hypervisor
|
|
+// processes are no longer running and the current process cmdline contains "delete" and "force"
|
|
+func (s *Sandbox) shouldForceDelete() bool {
|
|
+ cmdline, err := utils.GetProcessCmdline(os.Getpid())
|
|
+ if err != nil {
|
|
+ s.Logger().Errorf("fail to get process cmdline: %v", err)
|
|
+ return false
|
|
+ }
|
|
+
|
|
+ proxyPid := s.agent.getProxyPid()
|
|
+ hypervisorPids := s.hypervisor.getPids()
|
|
+ if len(hypervisorPids) <= 0 {
|
|
+ s.Logger().Warnf("get hypervisor main pid fail")
|
|
+ return false
|
|
+ }
|
|
+ hypervisorMainPid := hypervisorPids[0]
|
|
+ hypervisorPath := s.hypervisor.hypervisorConfig().HypervisorPath
|
|
+ hypervisorName := filepath.Base(hypervisorPath)
|
|
+
|
|
+ if !utils.IsProcessRunning(proxyPid, KataProxyProcessName, s.id) && !utils.IsProcessRunning(hypervisorMainPid, hypervisorName, s.id) &&
|
|
+ strings.Contains(cmdline, "delete") && strings.Contains(cmdline, "force") {
|
|
+ return true
|
|
+ }
|
|
+
|
|
+ return false
|
|
+}
|
|
+
|
|
+func (s *Sandbox) forceDeleteSandbox() {
|
|
+ for _, c := range s.containers {
|
|
+ // force delete all containers in the sandbox
|
|
+ c.forceDeleteContainer()
|
|
+ }
|
|
+
|
|
+ globalSandboxList.removeSandbox(s.id)
|
|
+
|
|
+ if s.monitor != nil {
|
|
+ s.monitor.stop()
|
|
+ }
|
|
+
|
|
+ if err := s.hypervisor.cleanup(); err != nil {
|
|
+ s.Logger().WithError(err).Error("failed to force cleanup hypervisor resource")
|
|
+ }
|
|
+
|
|
+ s.agent.cleanup(s)
|
|
+
|
|
+ if err := s.store.Delete(); err != nil {
|
|
+ s.Logger().WithError(err).Warn("sandbox force delete store failed")
|
|
+ }
|
|
+}
|
|
+
|
|
+func (s *Sandbox) setContainersState(state types.StateString) error {
|
|
+ if state == "" {
|
|
+ return vcTypes.ErrNeedState
|
|
+ }
|
|
+
|
|
+ for _, c := range s.containers {
|
|
+ if err := c.setContainerState(state); err != nil {
|
|
+ return err
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return nil
|
|
+}
|
|
diff --git a/virtcontainers/types/sandbox.go b/virtcontainers/types/sandbox.go
|
|
index 3b64b20a..5d586b21 100644
|
|
--- a/virtcontainers/types/sandbox.go
|
|
+++ b/virtcontainers/types/sandbox.go
|
|
@@ -28,6 +28,9 @@ const (
|
|
|
|
// StateStopped represents a sandbox/container that has been stopped.
|
|
StateStopped StateString = "stopped"
|
|
+
|
|
+ // StateUnhealthy represents a sandbox/container that's in abnormal state.
|
|
+ StateUnhealthy StateString = "unhealthy"
|
|
)
|
|
|
|
const (
|
|
@@ -90,17 +93,17 @@ func (state *StateString) validTransition(oldState StateString, newState StateSt
|
|
|
|
switch *state {
|
|
case StateReady:
|
|
- if newState == StateRunning || newState == StateStopped {
|
|
+ if newState == StateRunning || newState == StateStopped || newState == StateUnhealthy {
|
|
return nil
|
|
}
|
|
|
|
case StateRunning:
|
|
- if newState == StatePaused || newState == StateStopped {
|
|
+ if newState == StatePaused || newState == StateStopped || newState == StateUnhealthy {
|
|
return nil
|
|
}
|
|
|
|
case StatePaused:
|
|
- if newState == StateRunning || newState == StateStopped {
|
|
+ if newState == StateRunning || newState == StateStopped || newState == StateUnhealthy {
|
|
return nil
|
|
}
|
|
|
|
@@ -108,6 +111,11 @@ func (state *StateString) validTransition(oldState StateString, newState StateSt
|
|
if newState == StateRunning {
|
|
return nil
|
|
}
|
|
+
|
|
+ case StateUnhealthy:
|
|
+ if newState == StateStopped {
|
|
+ return nil
|
|
+ }
|
|
}
|
|
|
|
return fmt.Errorf("Can not move from %v to %v",
|
|
diff --git a/virtcontainers/utils/utils.go b/virtcontainers/utils/utils.go
|
|
index 85c55489..2b555ebb 100644
|
|
--- a/virtcontainers/utils/utils.go
|
|
+++ b/virtcontainers/utils/utils.go
|
|
@@ -9,9 +9,13 @@ import (
|
|
"crypto/rand"
|
|
"errors"
|
|
"fmt"
|
|
+ "io/ioutil"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
+ "strconv"
|
|
+ "strings"
|
|
+ "syscall"
|
|
)
|
|
|
|
const cpBinaryName = "cp"
|
|
@@ -275,3 +279,45 @@ const (
|
|
MiB = KiB << 10
|
|
GiB = MiB << 10
|
|
)
|
|
+
|
|
+// GetProcessCmdline returns the cmdline of process pid by reading the /proc/<pid>/cmdline file
|
|
+func GetProcessCmdline(pid int) (cmdline string, err error) {
|
|
+ if pid <= 1 {
|
|
+ return "", fmt.Errorf("invalid pid number")
|
|
+ }
|
|
+
|
|
+ bytes, err := ioutil.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "cmdline"))
|
|
+ if err != nil {
|
|
+ return "", err
|
|
+ }
|
|
+
|
|
+ return string(bytes), nil
|
|
+}
|
|
+
|
|
+func IsProcessRunning(pid int, processName string, sandboxID string) bool {
|
|
+ if pid <= 0 {
|
|
+ return false
|
|
+ }
|
|
+
|
|
+ process, err := os.FindProcess(pid)
|
|
+ if err != nil {
|
|
+ return false
|
|
+ }
|
|
+
|
|
+ if err := process.Signal(syscall.Signal(0)); err != nil {
|
|
+ return false
|
|
+ }
|
|
+
|
|
+ cmdline, err := GetProcessCmdline(pid)
|
|
+ if err != nil {
|
|
+ return false
|
|
+ }
|
|
+
|
|
+ // If the process's cmdline contains both the processName and sandboxID keywords,
|
|
+ // we assume the pid has not been reused by an unrelated process
|
|
+ if strings.Contains(cmdline, processName) && strings.Contains(cmdline, sandboxID) {
|
|
+ return true
|
|
+ }
|
|
+
|
|
+ return false
|
|
+}
|
|
diff --git a/virtcontainers/vm.go b/virtcontainers/vm.go
|
|
index fcda1e97..8d27b1fe 100644
|
|
--- a/virtcontainers/vm.go
|
|
+++ b/virtcontainers/vm.go
|
|
@@ -191,7 +191,7 @@ func NewVM(ctx context.Context, config VMConfig) (*VM, error) {
|
|
defer func() {
|
|
if err != nil {
|
|
virtLog.WithField("vm", id).WithError(err).Info("clean up vm")
|
|
- hypervisor.stopSandbox()
|
|
+ hypervisor.stopSandbox(false)
|
|
}
|
|
}()
|
|
|
|
@@ -333,7 +333,7 @@ func (v *VM) Disconnect() error {
|
|
func (v *VM) Stop() error {
|
|
v.logger().Info("stop vm")
|
|
|
|
- if err := v.hypervisor.stopSandbox(); err != nil {
|
|
+ if err := v.hypervisor.stopSandbox(false); err != nil {
|
|
return err
|
|
}
|
|
|
|
--
|
|
2.14.3 (Apple Git-98)
|
|
|