Skip to content

Commit

Permalink
container: add a timeout for deletion
Browse files Browse the repository at this point in the history
On a heavily loaded host, we were experiencing long container(d) deletion
times:

    containerd[8907]: time="2024-03-25T13:47:45.938479195Z" level=debug msg="event forwarded" ns=moby topic=/tasks/exit type=containerd.events.TaskExit
    # our control plane logic deletes the successfully exited container via
    # the docker API, and...
    containerd[8907]: time="2024-03-25T13:47:47.202055216Z" level=debug msg="failed to delete task" error="context deadline exceeded" id=a618057629b35e3bfea82d5ce4cbb057ba979498496428dfe6935a1322b94add

Before 4bafaa0 ("Refactor libcontainerd to minimize c8d RPCs") when
this happens, the docker API reports a 255 exit code and no error:

    0a7ddd027c0497d5a titus-executor-[900884]: Processing msg from a docker: main container exited with code 255

which is especially confusing. After 4bafaa0, the behavior has changed
to report the container's real exit code, although there is still a hard
coded timeout after which containerd will (try to) stop cleaning up. We
would like to wait for this cleanup, so let's add a user configurable
DeleteTimeout here.

Reported-by: Hechao Li <hli@netflix.com>
Signed-off-by: Tycho Andersen <tandersen@netflix.com>
  • Loading branch information
tych0 committed Apr 18, 2024
1 parent 8383c48 commit 988e948
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 9 deletions.
13 changes: 7 additions & 6 deletions api/types/container/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,11 @@ type Config struct {
// Mac Address of the container.
//
// Deprecated: this field is deprecated since API v1.44. Use EndpointSettings.MacAddress instead.
MacAddress string `json:",omitempty"`
OnBuild []string // ONBUILD metadata that were defined on the image Dockerfile
Labels map[string]string // List of labels set to this container
StopSignal string `json:",omitempty"` // Signal to stop a container
StopTimeout *int `json:",omitempty"` // Timeout (in seconds) to stop a container
Shell strslice.StrSlice `json:",omitempty"` // Shell for shell-form of RUN, CMD, ENTRYPOINT
MacAddress string `json:",omitempty"`
OnBuild []string // ONBUILD metadata that were defined on the image Dockerfile
Labels map[string]string // List of labels set to this container
StopSignal string `json:",omitempty"` // Signal to stop a container
StopTimeout *int `json:",omitempty"` // Timeout (in seconds) to stop a container
DeleteTimeout *int `json:",omitempty"` // Timeout (in seconds) to wait for a container to be deleted
Shell strslice.StrSlice `json:",omitempty"` // Shell for shell-form of RUN, CMD, ENTRYPOINT
}
14 changes: 12 additions & 2 deletions container/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ import (
)

const (
configFileName = "config.v2.json"
hostConfigFileName = "hostconfig.json"
configFileName = "config.v2.json"
hostConfigFileName = "hostconfig.json"
defaultDeleteTimeout = 30
)

// ExitStatus provides exit reasons for a container.
Expand Down Expand Up @@ -562,6 +563,15 @@ func (container *Container) StopTimeout() int {
return defaultStopTimeout
}

// DeleteTimeout returns the timeout (in seconds) to wait for a container to be deleted.
func (container *Container) DeleteTimeout() int {
if container.Config.DeleteTimeout != nil {
return *container.Config.DeleteTimeout
}

return defaultDeleteTimeout
}

// InitDNSHostConfig ensures that the dns fields are never nil.
// New containers don't ever have those fields nil,
// but pre created containers can still have those nil values.
Expand Down
20 changes: 20 additions & 0 deletions container/container_unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,26 @@ func TestContainerStopTimeout(t *testing.T) {
}
}

func TestContainerDeleteTimeout(t *testing.T) {
c := &Container{
Config: &container.Config{},
}

s := c.DeleteTimeout()
if s != defaultDeleteTimeout {
t.Fatalf("Expected %v, got %v", defaultDeleteTimeout, s)
}

deleteTimeout := 15
c = &Container{
Config: &container.Config{DeleteTimeout: &deleteTimeout},
}
s = c.DeleteTimeout()
if s != deleteTimeout {
t.Fatalf("Expected %v, got %v", deleteTimeout, s)
}
}

func TestContainerSecretReferenceDestTarget(t *testing.T) {
ref := &swarmtypes.SecretReference{
File: &swarmtypes.SecretReferenceFileTarget{
Expand Down
3 changes: 2 additions & 1 deletion daemon/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontaine

tsk, ok := c.Task()
if ok {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
deleteTimeout := time.Duration(c.DeleteTimeout()) * time.Second
ctx, cancel := context.WithTimeout(context.Background(), deleteTimeout)
es, err := tsk.Delete(ctx)
cancel()
if err != nil {
Expand Down
10 changes: 10 additions & 0 deletions image/cache/compare.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,16 @@ func compare(a, b *container.Config) bool {
return false
}
}

if (a.DeleteTimeout == nil) != (b.DeleteTimeout == nil) {
return false
}
if a.DeleteTimeout != nil && b.DeleteTimeout != nil {
if *a.DeleteTimeout != *b.DeleteTimeout {
return false
}
}

if (a.Healthcheck == nil) != (b.Healthcheck == nil) {
return false
}
Expand Down

0 comments on commit 988e948

Please sign in to comment.