From ac8b4e3e75401b4739e62f8339c2efcf809ca10d Mon Sep 17 00:00:00 2001 From: Albin Kerouanton Date: Thu, 20 Feb 2025 14:43:11 +0100 Subject: [PATCH] daemon: handleContainerExit: ignore networking errors Prior to commit fe856b9, containers' network sandbox and interfaces were created before the containerd task. Now, it's created after. If this step fails, the containerd task is forcefully deleted, and an event is sent to the c8d event monitor, which triggers `handleContainerExit`. Then this method tries to restart the faulty container. This leads to containers with a published port already in use to be stuck in a tight restart loop (if they're started with `--restart=always`) until the port is available. This is needlessly spamming the daemon logs. Prior to that commit, a published port already in use wouldn't trigger the restart process. This commit adds a check to `handleContainerExit` to ignore exit events if the latest container error is related to networking setup. Signed-off-by: Albin Kerouanton --- daemon/container_operations.go | 2 ++ daemon/monitor.go | 15 +++++++++ daemon/start_linux.go | 5 ++- .../network/bridge/bridge_linux_test.go | 31 +++++++++++++++++++ 4 files changed, 52 insertions(+), 1 deletion(-) diff --git a/daemon/container_operations.go b/daemon/container_operations.go index c4bfbdcaac..d07fe8dc45 100644 --- a/daemon/container_operations.go +++ b/daemon/container_operations.go @@ -37,6 +37,8 @@ import ( "go.opentelemetry.io/otel/trace" ) +const errSetupNetworking = "failed to set up container networking" + func ipAddresses(ips []net.IP) []string { var addrs []string for _, ip := range ips { diff --git a/daemon/monitor.go b/daemon/monitor.go index e6e2dfc6fe..a1d037cdab 100644 --- a/daemon/monitor.go +++ b/daemon/monitor.go @@ -3,6 +3,7 @@ package daemon // import "github.com/docker/docker/daemon" import ( "context" "strconv" + "strings" "time" "github.com/containerd/log" @@ -32,6 +33,20 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontaine var ctrExitStatus container.ExitStatus c.Lock() + // If the latest container error is related to networking setup, don't try + // to restart the container, and don't change the container state to + // 'exited'. This happens when, for example, [daemon.allocateNetwork] fails + // due to published ports being already in use. In that case, we want to + // keep the container in the 'created' state. + // + // c.ErrorMsg is set by [daemon.containerStart], and doesn't preserve the + // error type (because this field is persisted on disk). So, use string + // matching instead of usual error comparison methods. + if strings.Contains(c.ErrorMsg, errSetupNetworking) { + c.Unlock() + return nil + } + cfg := daemon.config() // Health checks will be automatically restarted if/when the diff --git a/daemon/start_linux.go b/daemon/start_linux.go index 8be536ecc5..fc79938c25 100644 --- a/daemon/start_linux.go +++ b/daemon/start_linux.go @@ -34,5 +34,8 @@ func (daemon *Daemon) initializeCreatedTask( return errdefs.System(err) } } - return daemon.allocateNetwork(ctx, cfg, ctr) + if err := daemon.allocateNetwork(ctx, cfg, ctr); err != nil { + return fmt.Errorf("%s: %w", errSetupNetworking, err) + } + return nil } diff --git a/integration/network/bridge/bridge_linux_test.go b/integration/network/bridge/bridge_linux_test.go index 91c92c0ef5..3b89e02232 100644 --- a/integration/network/bridge/bridge_linux_test.go +++ b/integration/network/bridge/bridge_linux_test.go @@ -19,6 +19,7 @@ import ( "github.com/docker/docker/libnetwork/netlabel" "github.com/docker/docker/testutil" "github.com/docker/docker/testutil/daemon" + "github.com/docker/go-connections/nat" "github.com/vishvananda/netlink" "gotest.tools/v3/assert" is "gotest.tools/v3/assert/cmp" @@ -423,3 +424,33 @@ func TestEndpointWithCustomIfname(t *testing.T) { assert.NilError(t, err) assert.Assert(t, strings.Contains(out.Stdout, ": foobar@if"), "expected ': foobar@if' in 'ip link show':\n%s", out.Stdout) } + +// TestPublishedPortAlreadyInUse checks that a container that can't start +// because of one its published port being already in use doesn't end up +// triggering the restart loop. +// +// Regression test for: https://github.com/moby/moby/issues/49501 +func TestPublishedPortAlreadyInUse(t *testing.T) { + ctx := setupTest(t) + apiClient := testEnv.APIClient() + + ctr1 := ctr.Run(ctx, t, apiClient, + ctr.WithCmd("top"), + ctr.WithExposedPorts("80/tcp"), + ctr.WithPortMap(nat.PortMap{"80/tcp": {{HostPort: "8000"}}})) + defer ctr.Remove(ctx, t, apiClient, ctr1, containertypes.RemoveOptions{Force: true}) + + ctr2 := ctr.Create(ctx, t, apiClient, + ctr.WithCmd("top"), + ctr.WithRestartPolicy(containertypes.RestartPolicyAlways), + ctr.WithExposedPorts("80/tcp"), + ctr.WithPortMap(nat.PortMap{"80/tcp": {{HostPort: "8000"}}})) + defer ctr.Remove(ctx, t, apiClient, ctr2, containertypes.RemoveOptions{Force: true}) + + err := apiClient.ContainerStart(ctx, ctr2, containertypes.StartOptions{}) + assert.Assert(t, is.ErrorContains(err, "failed to set up container networking")) + + inspect, err := apiClient.ContainerInspect(ctx, ctr2) + assert.NilError(t, err) + assert.Check(t, is.Equal(inspect.State.Status, "created")) +}