From ce7523dfe1bb60cf54254e16a103fd3fc9503618 Mon Sep 17 00:00:00 2001
From: yangfeiyu
Date: Thu, 17 Sep 2020 10:38:38 +0800
Subject: [PATCH 3/5] kata_runtime: support host cgroup with emulator policy

reason: support host cgroup with emulator policy when sandbox_cgroup_with_emulator is set true

Signed-off-by: yangfeiyu
---
 cli/create.go                   |  38 ++++++++++++
 virtcontainers/api.go           |  10 ++-
 virtcontainers/cgroups.go       | 132 ++++++++++++++++++++++++++++++++++------
 virtcontainers/persist/fs/fs.go |   8 +++
 virtcontainers/pkg/oci/utils.go |  14 +++++
 virtcontainers/sandbox.go       |  70 ++++++++++++++++++++-
 6 files changed, 250 insertions(+), 22 deletions(-)

diff --git a/cli/create.go b/cli/create.go
index 02cb2c5..b14434b 100644
--- a/cli/create.go
+++ b/cli/create.go
@@ -11,6 +11,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"path/filepath"
 
 	"github.com/kata-containers/runtime/pkg/katautils"
 	vc "github.com/kata-containers/runtime/virtcontainers"
@@ -134,11 +135,48 @@ func create(ctx context.Context, containerID, bundlePath, console, pidFilePath s
 	var process vc.Process
 	switch containerType {
 	case vc.PodSandbox:
+		if runtimeConfig.SandboxCgroupWithEmulator {
+			// create the sandbox level cgroup
+			cgroupPath := ociSpec.Linux.CgroupsPath
+			if err = vci.CreateSandboxCgroup(ctx, cgroupPath); err != nil {
+				return err
+			}
+
+			defer func() {
+				if err != nil {
+					_ = vci.DestroySandboxCgroup(ctx, cgroupPath)
+				}
+			}()
+
+			// add kata-runtime create process into /vcpu cgroup
+			vcpuCgroupPath := filepath.Join(cgroupPath, "vcpu")
+			if err = vci.AddPidToSandboxCgroup(ctx, os.Getpid(), vcpuCgroupPath); err != nil {
+				return err
+			}
+		}
+
 		_, process, err = katautils.CreateSandbox(ctx, vci, ociSpec, runtimeConfig, rootFs, containerID, bundlePath, console, disableOutput, systemdCgroup, false)
 		if err != nil {
 			return err
 		}
 	case vc.PodContainer:
+		if runtimeConfig.SandboxCgroupWithEmulator {
+			sandboxID, err := oci.GetSandboxIDFromAnnotations(&ociSpec)
+			if err != nil {
+				return fmt.Errorf("container annotation doesn't contain sandboxID")
+			}
+
+			sandboxCgroupPath, err := vci.GetSandboxCgroupPath(ctx, sandboxID)
+			if err != nil {
+				return err
+			}
+
+			// add kata-runtime create process into /vcpu cgroup
+			vcpuCgroupPath := filepath.Join(sandboxCgroupPath, "vcpu")
+			if err = vci.AddPidToSandboxCgroup(ctx, os.Getpid(), vcpuCgroupPath); err != nil {
+				return err
+			}
+		}
 		process, err = katautils.CreateContainer(ctx, vci, nil, ociSpec, rootFs, containerID, bundlePath, console, disableOutput, false)
 		if err != nil {
 			return err
diff --git a/virtcontainers/api.go b/virtcontainers/api.go
index 08bcbb5..38c8235 100644
--- a/virtcontainers/api.go
+++ b/virtcontainers/api.go
@@ -103,9 +103,7 @@ func createSandboxFromConfig(ctx context.Context, sandboxConfig SandboxConfig, f
 	}()
 
 	// Move runtime to sandbox cgroup so all process are created there.
-	if s.config.SandboxCgroupWithEmulator{
-		// emulator
-	} else if s.config.SandboxCgroupOnly {
+	if !s.config.SandboxCgroupWithEmulator && s.config.SandboxCgroupOnly {
 		if err := s.setupSandboxCgroup(); err != nil {
 			return nil, err
 		}
@@ -129,6 +127,12 @@ func createSandboxFromConfig(ctx context.Context, sandboxConfig SandboxConfig, f
 		return nil, err
 	}
 
+	if s.config.SandboxCgroupWithEmulator {
+		if err := s.setupHostCgroupsWithEmulator(); err != nil {
+			return nil, err
+		}
+	}
+
 	// Create Containers
 	if err = s.createContainers(); err != nil {
 		return nil, err
diff --git a/virtcontainers/cgroups.go b/virtcontainers/cgroups.go
index df0ec30..65d2001 100644
--- a/virtcontainers/cgroups.go
+++ b/virtcontainers/cgroups.go
@@ -9,19 +9,15 @@ package virtcontainers
 import (
 	"bufio"
 	"context"
-	"encoding/json"
 	"fmt"
-	"io/ioutil"
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 
 	"github.com/containerd/cgroups"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
-
-	"github.com/kata-containers/runtime/virtcontainers/store"
-	"github.com/kata-containers/runtime/virtcontainers/types"
 )
 
 type cgroupPather interface {
@@ -32,7 +28,11 @@ type cgroupPather interface {
 // unconstrained cgroups are placed here.
 // for example /sys/fs/cgroup/memory/kata/$CGPATH
 // where path is defined by the containers manager
-const cgroupKataPath = "/kata/"
+const (
+	cgroupKataPath     = "/kata/"
+	vcpuCgroupName     = "vcpu"
+	emulatorCgroupName = "emulator"
+)
 
 var cgroupsLoadFunc = cgroups.Load
 var cgroupsNewFunc = cgroups.New
@@ -105,24 +105,16 @@ func deleteCgroup(hierarchy cgroups.Hierarchy, cgroupPath string) error {
 
 // GetSandboxCgroupPath return the cgroup path of specified sandbox
 func GetSandboxCgroupPath(ctx context.Context, sandboxID string) (string, error) {
-	stateFilePath := filepath.Join(store.RunStoragePath(), sandboxID, store.StateFile)
-
-	fileData, err := ioutil.ReadFile(stateFilePath)
+	config, err := loadSandboxConfig(sandboxID)
 	if err != nil {
 		return "", err
 	}
 
-	state := types.SandboxState{}
-
-	if err := json.Unmarshal(fileData, &state); err != nil {
-		return "", err
-	}
-
-	if state.CgroupPath == "" {
-		return "", fmt.Errorf("get sandbox cgroup path error: cgroupPath is empty")
+	if config.Cgroups == nil {
+		return "", fmt.Errorf("the cgroups of sandbox %s is nil", sandboxID)
 	}
 
-	return state.CgroupPath, nil
+	return config.Cgroups.Path, nil
 }
 
 // AddPidToSandboxCgroup add kata-runtime create process to cgroup
@@ -276,3 +268,107 @@ func validCPUResources(cpuSpec *specs.LinuxCPU) *specs.LinuxCPU {
 
 	return &cpu
 }
+
+// getQemuTaskWithoutVcpu filter out tasks under /proc/{qemu pid}/task, to find out the task of not VCPU,
+// VCPU task is filtered by "query-cpus" qmp command
+func getQemuTaskWithoutVcpu(sandbox *Sandbox, vmPid int) []int {
+	procPath := fmt.Sprintf("/proc/%d/task", vmPid)
+
+	dirReader, err := os.Open(procPath)
+	if err != nil {
+		logrus.Warningf("cannot open %s: %s", procPath, err)
+		return nil
+	}
+
+	defer dirReader.Close()
+
+	dirs, err := dirReader.Readdirnames(0)
+	if err != nil {
+		logrus.Warningf("walking dirs in %s failed: %s", procPath, err)
+		return nil
+	}
+
+	vcpuThreadInfo, err := sandbox.hypervisor.getThreadIDs()
+	if err != nil {
+		logrus.Warnf("get hypervisor Thread ID failed: %v", err)
+		return nil
+	}
+
+	var vcpuThreadIDs []int
+	for _, value := range vcpuThreadInfo.vcpus {
+		vcpuThreadIDs = append(vcpuThreadIDs, value)
+	}
+
+	var allThreadIDs []int
+	for _, dir := range dirs {
+		p, err := strconv.Atoi(dir)
+		if err != nil {
+			logrus.Warnf("can not change string dir: %s to int type", dir)
+			return nil
+		}
+
+		allThreadIDs = append(allThreadIDs, p)
+	}
+
+	nonVCPUThreads := diffSlice(allThreadIDs, vcpuThreadIDs)
+
+	return nonVCPUThreads
+}
+
+func pulloutQemuThread(sandbox *Sandbox, vmPid int, path string) error {
+	control, err := cgroups.New(cgroups.SingleSubsystem(cgroups.V1, cgroups.Cpu),
+		cgroups.StaticPath(path),
+		&specs.LinuxResources{})
+	if err != nil {
+		return err
+	}
+	taskIds := getQemuTaskWithoutVcpu(sandbox, vmPid)
+	if len(taskIds) == 0 {
+		logrus.Warnf("no taskId id in qemu other than vcpu found of pid %d", vmPid)
+		return nil
+	}
+	for _, taskId := range taskIds {
+		if err := control.AddTask(cgroups.Process{
+			Pid: taskId,
+		}); err != nil {
+			logrus.Errorf("failed to add task %d to cgroup of %s", taskId, path)
+			return err
+		}
+	}
+
+	return nil
+}
+
+// checkCgroupExist check cgroup exist or not
+func checkCgroupExist(hierarchy cgroups.Hierarchy, path string) bool {
+	subSystems, _ := hierarchy()
+	for _, s := range cgroupPathers(subSystems) {
+		if _, err := os.Lstat(s.Path(path)); err != nil {
+			if os.IsNotExist(err) {
+				return false
+			}
+		}
+	}
+
+	return true
+}
+
+// diffSlice return the s1 - s2
+func diffSlice(s1, s2 []int) []int {
+	var diffSlice []int
+	for _, p := range s1 {
+		if !isInSlice(p, s2) {
+			diffSlice = append(diffSlice, p)
+		}
+	}
+	return diffSlice
+}
+
+func isInSlice(i int, s []int) bool {
+	for _, v := range s {
+		if i == v {
+			return true
+		}
+	}
+	return false
+}
diff --git a/virtcontainers/persist/fs/fs.go b/virtcontainers/persist/fs/fs.go
index 38efdba..641d64e 100644
--- a/virtcontainers/persist/fs/fs.go
+++ b/virtcontainers/persist/fs/fs.go
@@ -14,6 +14,8 @@ import (
 	"path/filepath"
 	"syscall"
 
+	"github.com/opencontainers/runc/libcontainer/configs"
+
 	persistapi "github.com/kata-containers/runtime/virtcontainers/persist/api"
 	"github.com/sirupsen/logrus"
 )
@@ -78,6 +80,12 @@ func (fs *FS) ToDisk(ss persistapi.SandboxState, cs map[string]persistapi.Contai
 		return fmt.Errorf("sandbox container id required")
 	}
 
+	if ss.Config.Cgroups == nil {
+		ss.Config.Cgroups = &configs.Cgroup{
+			Path: ss.CgroupPath,
+		}
+	}
+
 	fs.sandboxState = &ss
 	fs.containerState = cs
 
diff --git a/virtcontainers/pkg/oci/utils.go b/virtcontainers/pkg/oci/utils.go
index 91067fb..e8ef41b 100644
--- a/virtcontainers/pkg/oci/utils.go
+++ b/virtcontainers/pkg/oci/utils.go
@@ -1136,3 +1136,17 @@ func validateSandboxDNS(value string) error {
 
 	return nil
 }
+
+func GetSandboxIDFromAnnotations(s *specs.Spec) (string, error) {
+	if s == nil {
+		return "", fmt.Errorf("spec is nil")
+	}
+
+	for _, v := range CRISandboxNameKeyList {
+		if sandboxID, ok := s.Annotations[v]; ok {
+			return sandboxID, nil
+		}
+	}
+
+	return "", fmt.Errorf("failed to find the sandbox ID")
+}
diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go
index b479cf5..ca4e700 100644
--- a/virtcontainers/sandbox.go
+++ b/virtcontainers/sandbox.go
@@ -2162,7 +2162,9 @@ func (s *Sandbox) cgroupsDelete() error {
 	var cgroupSubsystems cgroups.Hierarchy
 
 	if s.config.SandboxCgroupWithEmulator {
-		// emulator
+		if err := deleteCgroup(cgroups.V1, s.state.CgroupPath); err != nil {
+			return err
+		}
 	} else if s.config.SandboxCgroupOnly {
 		return s.cgroupMgr.Destroy()
 	}
@@ -2381,6 +2383,68 @@ func (s *Sandbox) setupSandboxCgroup() error {
 	return nil
 }
 
+func (s *Sandbox) setupHostCgroupsWithEmulator() error {
+	if len(s.config.Containers) == 0 {
+		return nil
+	}
+
+	sandboxContainerSpec := s.GetPatchedOCISpec()
+	if sandboxContainerSpec == nil {
+		return fmt.Errorf("sandbox container should not be empty")
+	}
+
+	// Set sandbox's cgroup path
+	s.state.CgroupPath = sandboxContainerSpec.Linux.CgroupsPath
+
+	if !checkCgroupExist(cgroups.V1, s.state.CgroupPath) {
+		return fmt.Errorf("sandbox's cgroup %s doesn't exist", s.state.CgroupPath)
+	}
+
+	// pull out qemu threads other than vcpu to the cgroup of "/emulator"
+	if s.config.HypervisorType == QemuHypervisor {
+		emulatorCgroupPath := filepath.Join(s.state.CgroupPath, emulatorCgroupName)
+		hypervisorPids := s.hypervisor.getPids()
+		if len(hypervisorPids) == 0 || hypervisorPids[0] == 0 {
+			return fmt.Errorf("hypervisor pid: %v invalid", hypervisorPids)
+		}
+		if err := pulloutQemuThread(s, hypervisorPids[0], emulatorCgroupPath); err != nil {
+			return err
+		}
+	}
+
+	// limit cpu to "/vcpu"
+	vcpuCgroupPath := filepath.Join(s.state.CgroupPath, vcpuCgroupName)
+	vcpuResources := specs.LinuxResources{
+		CPU: s.cpuResources(),
+	}
+	if err := applyResourceLimit(&vcpuResources, vcpuCgroupPath); err != nil {
+		return err
+	}
+
+	// limit blkio resource to ""
+
+	// limit files resource
+
+	return nil
+}
+
+func applyResourceLimit(resources *specs.LinuxResources, cgroupPath string) error {
+	if resources == nil {
+		return nil
+	}
+
+	control, err := cgroupsLoadFunc(cgroups.V1, cgroups.StaticPath(cgroupPath))
+	if err != nil {
+		return fmt.Errorf("could not load cgroup %v: %v", cgroupPath, err)
+	}
+
+	if err = control.Update(resources); err != nil {
+		return fmt.Errorf("could not update cgroup %v: %v", cgroupPath, err)
+	}
+
+	return nil
+}
+
 // GetPatchedOCISpec returns sandbox's OCI specification
 // This OCI specification was patched when the sandbox was created
 // by containerCapabilities(), SetEphemeralStorageType() and others
@@ -2452,6 +2516,10 @@ func (s *Sandbox) forceDeleteSandbox() {
 		c.forceDeleteContainer()
 	}
 
+	if err := deleteCgroup(cgroups.V1, s.state.CgroupPath); err != nil {
+		s.Logger().Warnf("sandbox forceDelete cgroups failed: %v", err)
+	}
+
 	globalSandboxList.removeSandbox(s.id)
 
 	if s.monitor != nil {
-- 
1.8.3.1