From ec9315cd4f9b0deace93ff088e769710b3183c2c Mon Sep 17 00:00:00 2001 From: Albin Kerouanton Date: Thu, 11 Dec 2025 17:42:21 +0100 Subject: [PATCH] daemon: clean up dead containers on start Stopping the Engine while a container with autoremove set is running may leave behind dead containers on disk. These containers aren't reclaimed on the next start, appear as "dead" in `docker ps -a` and can't be inspected or removed by the user. This bug has existed for a long time but became user-visible with 9f5f4f5a4273e920d5d77c1e73db8bebe65982bb. Prior to that commit, containers with no rwlayer weren't added to the in-memory viewdb, so they weren't visible in `docker ps -a`. However, some dangling files would still live on disk (e.g. a folder in /var/lib/docker/containers, mount points, etc.). The underlying issue is that when the daemon stops, it tries to stop all running containers and then closes the containerd client. This leaves a small window of time where the Engine might receive 'task stop' events from containerd, and trigger autoremove. If the containerd client is closed in parallel, the Engine is unable to complete the removal, leaving the container in the 'dead' state. In such a case, the Engine logs the following error: cannot remove container "bcbc98b4f5c2b072eb3c4ca673fa1c222d2a8af00bf58eae0f37085b9724ea46": Canceled: grpc: the client connection is closing: context canceled Solving the underlying issue would require complex changes to the shutdown sequence. Moreover, the same issue could also happen if the daemon crashes while it deletes a container. Thus, add a cleanup step on daemon startup to remove these dead containers. 
Signed-off-by: Albin Kerouanton --- daemon/daemon.go | 17 ++++++++++++----- integration/container/remove_test.go | 24 ++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/daemon/daemon.go b/daemon/daemon.go index a2849cbfbf..8e3b699a14 100644 --- a/daemon/daemon.go +++ b/daemon/daemon.go @@ -643,18 +643,25 @@ func (daemon *Daemon) restore(ctx context.Context, cfg *configStore, containers } group.Wait() - for id := range removeContainers { + for id, c := range removeContainers { group.Add(1) - go func(cid string) { + go func(cid string, c *container.Container) { _ = sem.Acquire(context.Background(), 1) + defer group.Done() + defer sem.Release(1) + + if c.State.IsDead() { + if err := daemon.cleanupContainer(c, backend.ContainerRmConfig{ForceRemove: true, RemoveVolume: true}); err != nil { + log.G(ctx).WithField("container", cid).WithError(err).Error("failed to remove dead container") + } + return + } if err := daemon.containerRm(&cfg.Config, cid, &backend.ContainerRmConfig{ForceRemove: true, RemoveVolume: true}); err != nil { log.G(ctx).WithField("container", cid).WithError(err).Error("failed to remove container") } - sem.Release(1) - group.Done() - }(id) + }(id, c) } group.Wait() diff --git a/integration/container/remove_test.go b/integration/container/remove_test.go index d36f728832..6a340b32f1 100644 --- a/integration/container/remove_test.go +++ b/integration/container/remove_test.go @@ -8,6 +8,7 @@ import ( containertypes "github.com/moby/moby/api/types/container" "github.com/moby/moby/client" "github.com/moby/moby/v2/integration/internal/container" + "github.com/moby/moby/v2/internal/testutil/daemon" "gotest.tools/v3/assert" is "gotest.tools/v3/assert/cmp" "gotest.tools/v3/fs" @@ -107,3 +108,26 @@ func TestRemoveInvalidContainer(t *testing.T) { assert.Check(t, is.ErrorType(err, cerrdefs.IsNotFound)) assert.Check(t, is.ErrorContains(err, "No such container")) } + +func TestRemoveDeadContainersOnDaemonRestart(t *testing.T) { + 
skip.If(t, testEnv.IsRemoteDaemon) + skip.If(t, testEnv.DaemonInfo.OSType == "windows", "FIXME: Windows CI does not support multiple daemons yet") + + ctx := setupTest(t) + d := daemon.New(t) + d.StartWithBusybox(ctx, t) + defer d.Stop(t) + + apiClient := d.NewClientT(t) + container.Run(ctx, t, apiClient, container.WithCmd("top"), container.WithAutoRemove) + + list, err := apiClient.ContainerList(ctx, client.ContainerListOptions{All: true}) + assert.NilError(t, err) + assert.Check(t, is.Len(list.Items, 1)) + + d.Restart(t) + + list, err = apiClient.ContainerList(ctx, client.ContainerListOptions{All: true}) + assert.NilError(t, err) + assert.Check(t, is.Len(list.Items, 0)) +}