From 3ac1232a2e3fbfc0465473e5d81cde41847c4252 Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Wed, 19 Aug 2020 11:47:37 +0800 Subject: [PATCH 11/16] agent: fix agent blocking while reaping a container process reason: add a timeout to container waitProcess() for the case where the container process status is D/T. Signed-off-by: jiangpengfei --- grpc.go | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/grpc.go b/grpc.go index de2cae7..3dd088e 100644 --- a/grpc.go +++ b/grpc.go @@ -49,6 +49,11 @@ const ( libcontainerPath = "/run/libcontainer" ) +// waitProcessTimeOut is in seconds; keep it the same as the value kata-runtime uses while waiting for the WaitProcessRequest response +const ( + waitProcessTimeOut = 10 +) + var ( sysfsCPUOnlinePath = "/sys/devices/system/cpu" sysfsMemOnlinePath = "/sys/devices/system/memory" @@ -996,17 +1001,35 @@ func (a *agentGRPC) WaitProcess(ctx context.Context, req *pb.WaitProcessRequest) ctr.deleteProcess(proc.id) }) - // Using helper function wait() to deal with the subreaper. - libContProcess := (*reaperLibcontainerProcess)(&(proc.process)) - exitCode, err := a.sandbox.subreaper.wait(proc.exitCodeCh, libContProcess) - if err != nil { - return &pb.WaitProcessResponse{}, err + done := make(chan error) + var exitCode int = 0 + go func() { + // Using helper function wait() to deal with the subreaper. + libContProcess := (*reaperLibcontainerProcess)(&(proc.process)) + var err error + exitCode, err = a.sandbox.subreaper.wait(proc.exitCodeCh, libContProcess) + if err != nil { + done <- err + close(done) + return + } + // refill the exitCodeCh with the exit code so that it can be read out + // by another WaitProcess(). Since this channel isn't closed, + // the refill will always succeed, and it will be freed by the GC + // once the process exits. 
+ proc.exitCodeCh <- exitCode + + close(done) + }() + + select { + case err := <-done: + if err != nil { + return &pb.WaitProcessResponse{}, err + } + case <-time.After(time.Duration(waitProcessTimeOut) * time.Second): + return &pb.WaitProcessResponse{}, grpcStatus.Errorf(codes.DeadlineExceeded, "agent wait reap container process timeout reached after %ds", waitProcessTimeOut) } - //refill the exitCodeCh with the exitcode which can be read out - //by another WaitProcess(). Since this channel isn't be closed, - //here the refill will always success and it will be free by GC - //once the process exits. - proc.exitCodeCh <- exitCode return &pb.WaitProcessResponse{ Status: int32(exitCode), -- 2.14.3 (Apple Git-98)