The Container.State struct holds the container's state, and most of
its fields are expected to change dynamically. Some of these state changes
are explicit, for example, setting the container to be "stopped". Other
state changes are implicit, for example the container's process exiting
or being "OOM"-killed by the kernel.
The distinction between explicit ("desired") state changes and observed
("actual") state is sometimes vague; for some properties we clearly
separated them. For example, if a user requested the container to be
stopped or restarted, we store that on the Container object itself:

    HasBeenManuallyStopped bool // used for unless-stopped restart policy
    HasBeenManuallyRestarted bool `json:"-"` // used to distinguish restart caused by restart policy from the manual one
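For illustration only (this is not the daemon's restart-manager code, and
the helper below is a made-up name), the "unless-stopped" policy consults
that flag roughly like this:

    // Hypothetical sketch: restart on exit for "unless-stopped", unless the
    // user explicitly stopped the container at some point.
    func shouldAutoRestart(policyName string, ctr *container.Container) bool {
        return policyName == "unless-stopped" && !ctr.HasBeenManuallyStopped
    }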
Other properties are more ambiguous, such as "HasBeenStartedBefore" and
"RestartCount", which are stored on the Container (and persisted to
disk), but may be more related to "actual" state, and likely should
not be persisted:

    RestartCount int
    HasBeenStartedBefore bool
Given that (per the above) concurrency must be taken into account, most
changes to the `container.State` struct should be protected; here's where
things get blurry. While the `State` type provides various accessor methods,
only some of them take concurrency into account; for example, [State.IsRunning]
and [State.GetPID] acquire a lock, whereas [State.ExitCodeValue] does not.
Even the (commonly used) [State.StateString] has no locking at all.
The way this is handled is error-prone; [container.State] contains a mutex,
and it's exported. Given that it's embedded in the [container.Container]
struct, it's also exposed as an exported mutex for the container. The
assumption here is that by "merging" the two, the caller acquires a lock
when either the container _or_ its state must be mutated. However, because
some methods on `container.State` handle their own locking, consumers must
be deeply familiar with the internals, especially if changes to both the
`Container` AND `Container.State` must be made. This is amplified further
because some (exported!) methods, such as [container.SetRunning], mutate
multiple fields but don't acquire a lock (they expect the caller to hold
one), whereas their (also exported) counterparts (e.g. [State.IsRunning]) do.
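To illustrate the pitfall (a sketch, not code from this repository; the
variable names are made up), a caller that already holds the container's
lock must know which accessors lock internally and which don't:

    // Sketch only: the mixed locking conventions described above.
    ctr.Lock()
    ctr.State.SetRunning(c8dCtr, tsk, time.Now()) // mutates several fields; expects the caller to hold the lock
    paused := ctr.State.Paused                    // plain field read; only safe while the lock is held
    ctr.Unlock()

    // State.IsRunning acquires the lock itself, so calling it while still
    // holding ctr.Lock() would deadlock (the mutex is not re-entrant).
    running := ctr.State.IsRunning()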
It should be clear from the above that this needs some architectural
changes; a clearer separation between "desired" and "actual" state (opening
the potential to update the container's config without manually touching
its `State`), possibly a method to obtain a read-only copy of the current
state (for those querying state), and reviewing which fields belong where
(and should be persisted to disk, or only remain in memory).
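One possible shape for such a read-only copy (purely illustrative; nothing
like this is introduced in this change, and the `StateSnapshot` type and
`Snapshot` method are made-up names):

    // Hypothetical sketch: a point-in-time, read-only view of the state.
    type StateSnapshot struct {
        Running, Paused, Dead bool
        ExitCode              int
    }

    func (s *State) Snapshot() StateSnapshot {
        s.Lock()
        defer s.Unlock()
        return StateSnapshot{
            Running:  s.Running,
            Paused:   s.Paused,
            Dead:     s.Dead,
            ExitCode: s.ExitCodeValue,
        }
    }

Callers that only need to inspect state could then work from the copy,
without having to reason about the mutex at all.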
This PR preserves the status quo; it makes no structural changes, other
than making explicit where we access the container's state. Where previously
the State fields and methods were referred to as "part of the container"
(e.g. `ctr.IsRunning()` or `ctr.Running`), we now explicitly reference
the embedded `State` (`ctr.State.IsRunning()`, `ctr.State.Running`).
The exception (for now) is the mutex, which is still referenced through
the embedded struct (`ctr.Lock()` instead of `ctr.State.Lock()`), as this
is (mostly) by design: the lock protects the container and what's in it
(including its `State`).
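Taking the running-check as an example, the change amounts to:

    // Before this change: State members accessed through the embedding.
    if ctr.IsRunning() { // or: ctr.Running
        // ...
    }

    // After this change: the embedded State is referenced explicitly.
    if ctr.State.IsRunning() { // or: ctr.State.Running
        // ...
    }

    // The mutex is still used through the embedding, by design:
    ctr.Lock()
    defer ctr.Unlock()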
[State.IsRunning]: c4afa77157/daemon/container/state.go (L205-L209)
[State.GetPID]: c4afa77157/daemon/container/state.go (L211-L216)
[State.ExitCodeValue]: c4afa77157/daemon/container/state.go (L218-L228)
[State.StateString]: c4afa77157/daemon/container/state.go (L102-L131)
[container.State]: c4afa77157/daemon/container/state.go (L15-L23)
[container.Container]: c4afa77157/daemon/container/container.go (L67-L75)
[container.SetRunning]: c4afa77157/daemon/container/state.go (L230-L277)
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
package daemon

import (
	"context"
	"time"

	containerd "github.com/containerd/containerd/v2/client"
	"github.com/containerd/containerd/v2/core/containers"
	"github.com/containerd/log"
	"github.com/moby/moby/api/types/events"
	"github.com/moby/moby/v2/daemon/container"
	mobyc8dstore "github.com/moby/moby/v2/daemon/containerd"
	"github.com/moby/moby/v2/daemon/internal/libcontainerd"
	"github.com/moby/moby/v2/daemon/internal/metrics"
	"github.com/moby/moby/v2/daemon/internal/otelutil"
	"github.com/moby/moby/v2/daemon/server/backend"
	"github.com/moby/moby/v2/errdefs"
	"github.com/pkg/errors"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"
)

// validateState verifies if the container is in a non-conflicting state.
func validateState(ctr *container.Container) error {
	ctr.Lock()
	defer ctr.Unlock()

	// Intentionally checking paused first, because a container can be
	// BOTH running AND paused. To start a paused (but running) container,
	// it must be thawed ("un-paused").
	if ctr.State.Paused {
		return errdefs.Conflict(errors.New("cannot start a paused container, try unpause instead"))
	} else if ctr.State.Running {
		// This is not an actual error, but produces a 304 "not modified"
		// when returned through the API to indicate the container is
		// already in the desired state. It's implemented as an error
		// to make the code calling this function terminate early (as
		// no further processing is needed).
		return errdefs.NotModified(errors.New("container is already running"))
	}
	if ctr.State.RemovalInProgress || ctr.State.Dead {
		return errdefs.Conflict(errors.New("container is marked for removal and cannot be started"))
	}
	return nil
}

// ContainerStart starts a container.
func (daemon *Daemon) ContainerStart(ctx context.Context, name string, checkpoint string, checkpointDir string) error {
	daemonCfg := daemon.config()
	if checkpoint != "" && !daemonCfg.Experimental {
		return errdefs.InvalidParameter(errors.New("checkpoint is only supported in experimental mode"))
	}

	ctr, err := daemon.GetContainer(name)
	if err != nil {
		return err
	}
	if err := validateState(ctr); err != nil {
		return err
	}

	// Check if hostConfig is in line with the current system settings;
	// it may happen that cgroups are unmounted or the like.
	if _, err = daemon.verifyContainerSettings(daemonCfg, ctr.HostConfig, nil, false); err != nil {
		return errdefs.InvalidParameter(err)
	}

	return daemon.containerStart(ctx, daemonCfg, ctr, checkpoint, checkpointDir, true)
}

// containerStart prepares the container to run by setting up everything the
// container needs, such as storage and networking, as well as links
// between containers. The container is left waiting for a signal to
// begin running.
func (daemon *Daemon) containerStart(ctx context.Context, daemonCfg *configStore, container *container.Container, checkpoint string, checkpointDir string, resetRestartManager bool) (retErr error) {
	ctx, span := otel.Tracer("").Start(ctx, "daemon.containerStart", trace.WithAttributes(append(
		labelsAsOTelAttributes(container.Config.Labels),
		attribute.String("container.ID", container.ID),
		attribute.String("container.Name", container.Name),
	)...))
	defer func() {
		otelutil.RecordStatus(span, retErr)
		span.End()
	}()

	start := time.Now()
	container.Lock()
	defer container.Unlock()

	if resetRestartManager && container.State.Running { // skip this check if already in restarting step and resetRestartManager==false
		return nil
	}

	if container.State.RemovalInProgress || container.State.Dead {
		return errdefs.Conflict(errors.New("container is marked for removal and cannot be started"))
	}

	if checkpointDir != "" {
		// TODO(mlaventure): how would we support that?
		return errdefs.Forbidden(errors.New("custom checkpointdir is not supported"))
	}

	// if we encounter an error during start we need to ensure that any other
	// setup has been cleaned up properly
	defer func() {
		if retErr != nil {
			container.State.SetError(retErr)
			// if no one else has set it, make sure we don't leave it at zero
			if container.State.ExitCode() == 0 {
				container.State.SetExitCode(exitUnknown)
			}
			if err := container.CheckpointTo(context.WithoutCancel(ctx), daemon.containersReplica); err != nil {
				log.G(ctx).Errorf("%s: failed saving state on start failure: %v", container.ID, err)
			}
			container.Reset(false)

			daemon.Cleanup(context.WithoutCancel(ctx), container)
			// if the container's AutoRemove flag is set, remove it after clean up
			if container.HostConfig.AutoRemove {
				container.Unlock()
				if err := daemon.containerRm(&daemonCfg.Config, container.ID, &backend.ContainerRmConfig{ForceRemove: true, RemoveVolume: true}); err != nil {
					log.G(ctx).Errorf("can't remove container %s: %v", container.ID, err)
				}
				container.Lock()
			}
		}
	}()

	if err := daemon.conditionalMountOnStart(container); err != nil {
		return err
	}

	newSandbox, err := daemon.initializeNetworking(ctx, &daemonCfg.Config, container)
	if err != nil {
		return err
	}
	defer func() {
		if retErr != nil && newSandbox != nil {
			if err := newSandbox.Delete(ctx); err != nil {
				log.G(ctx).WithFields(log.Fields{
					"error":     err,
					"container": container.ID,
				}).Warn("After failure in networking initialisation, failed to remove sandbox")
			}
		}
	}()

	mnts, err := daemon.setupContainerDirs(container)
	if err != nil {
		return err
	}

	m, cleanup, err := daemon.setupMounts(ctx, container)
	if err != nil {
		return err
	}
	mnts = append(mnts, m...)
	defer cleanup(context.WithoutCancel(ctx))

	spec, err := daemon.createSpec(ctx, daemonCfg, container, mnts)
	if err != nil {
		// Any error that occurs while creating the spec, even if it's the
		// result of an invalid container config, must be considered a System
		// error (internal server error), as it's not an error with the request
		// to start the container.
		//
		// Invalid configuration in the config itself must be validated when
		// creating the container (creating its config), but some errors are
		// dependent on the current state, for example when starting a container
		// that shares a namespace with another container, and that container
		// is not running (or missing).
		return errdefs.System(err)
	}

	if resetRestartManager {
		container.ResetRestartManager(true)
		container.HasBeenManuallyStopped = false
	}

	if err := daemon.saveAppArmorConfig(container); err != nil {
		return err
	}

	if checkpoint != "" {
		checkpointDir, err = getCheckpointDir(checkpointDir, checkpoint, container.Name, container.ID, container.CheckpointDir(), false)
		if err != nil {
			return err
		}
	}

	shim, createOptions, err := daemon.getLibcontainerdCreateOptions(daemonCfg, container)
	if err != nil {
		return err
	}

	ctr, err := libcontainerd.ReplaceContainer(ctx, daemon.containerd, container.ID, spec, shim, createOptions, func(ctx context.Context, client *containerd.Client, c *containers.Container) error {
		// Only set the image if we are using containerd for image storage.
		// This is for metadata purposes only.
		// Other lower-level components may make use of this information.
		is, ok := daemon.imageService.(*mobyc8dstore.ImageService)
		if !ok {
			return nil
		}
		img, err := is.ResolveImage(ctx, container.Config.Image)
		if err != nil {
			log.G(ctx).WithError(err).WithField("container", container.ID).Warn("Failed to resolve containerd image reference")
			return nil
		}
		c.Image = img.Name
		return nil
	})
	if err != nil {
		return setExitCodeFromError(container.State.SetExitCode, err)
	}
	defer func() {
		if retErr != nil {
			if err := ctr.Delete(context.WithoutCancel(ctx)); err != nil {
				log.G(ctx).WithError(err).WithField("container", container.ID).
					Error("failed to delete failed start container")
			}
		}
	}()

	startupTime := time.Now()
	// TODO(mlaventure): we need to specify checkpoint options here
	tsk, err := ctr.NewTask(context.WithoutCancel(ctx), // passing a cancelable ctx caused integration tests to be stuck in the cleanup phase
		checkpointDir, container.StreamConfig.Stdin() != nil || container.Config.Tty,
		container.InitializeStdio)
	if err != nil {
		return setExitCodeFromError(container.State.SetExitCode, err)
	}
	defer func() {
		if retErr != nil {
			if err := tsk.ForceDelete(context.WithoutCancel(ctx)); err != nil {
				log.G(ctx).WithError(err).WithField("container", container.ID).
					Error("failed to delete task after fail start")
			}
		}
	}()

	if err := daemon.initializeCreatedTask(ctx, &daemonCfg.Config, tsk, container, spec); err != nil {
		return err
	}

	if err := tsk.Start(context.WithoutCancel(ctx)); err != nil { // passing a cancelable ctx caused integration tests to be stuck in the cleanup phase
		return setExitCodeFromError(container.State.SetExitCode, err)
	}

	container.HasBeenManuallyRestarted = false
	container.State.SetRunning(ctr, tsk, startupTime)
	container.HasBeenStartedBefore = true
	daemon.setStateCounter(container)

	daemon.initHealthMonitor(container)

	if err := container.CheckpointTo(context.WithoutCancel(ctx), daemon.containersReplica); err != nil {
		log.G(ctx).WithError(err).WithField("container", container.ID).
			Errorf("failed to store container")
	}

	daemon.LogContainerEvent(container, events.ActionStart)
	metrics.ContainerActions.WithValues("start").UpdateSince(start)

	return nil
}

// Cleanup releases any network resources allocated to the container along with any rules
// around how containers are linked together. It also unmounts the container's root filesystem.
func (daemon *Daemon) Cleanup(ctx context.Context, container *container.Container) {
	// Microsoft HCS containers get in a bad state if host resources are
	// released while the container still exists.
	if ctr, ok := container.State.C8dContainer(); ok {
		if err := ctr.Delete(context.Background()); err != nil {
			log.G(ctx).Errorf("%s cleanup: failed to delete container from containerd: %v", container.ID, err)
		}
	}

	daemon.releaseNetwork(ctx, container)

	if err := container.UnmountIpcMount(); err != nil {
		log.G(ctx).Warnf("%s cleanup: failed to unmount IPC: %s", container.ID, err)
	}

	if err := daemon.conditionalUnmountOnCleanup(container); err != nil {
		// FIXME: remove once reference counting for graphdrivers has been refactored
		// Ensure that all the mounts are gone
		if mountid, err := daemon.imageService.GetLayerMountID(container.ID); err == nil {
			daemon.cleanupMountsByID(mountid)
		}
	}

	if err := container.UnmountSecrets(); err != nil {
		log.G(ctx).Warnf("%s cleanup: failed to unmount secrets: %s", container.ID, err)
	}

	if err := recursiveUnmount(container.Root); err != nil {
		log.G(ctx).WithError(err).WithField("container", container.ID).Warn("Error while cleaning up container resource mounts.")
	}

	for _, eConfig := range container.ExecCommands.Commands() {
		daemon.unregisterExecCommand(container, eConfig)
	}

	if container.BaseFS != "" {
		if err := container.UnmountVolumes(ctx, daemon.LogVolumeEvent); err != nil {
			log.G(ctx).Warnf("%s cleanup: Failed to umount volumes: %v", container.ID, err)
		}
	}

	container.CancelAttachContext()
}