The Container.State struct holds the container's state, and most of
its fields are expected to change dynamically. Some of these state changes
are explicit, for example, setting the container to be "stopped". Other
state changes are implicit, for example the container's process exiting
or being "OOM"-killed by the kernel.
The distinction between explicit ("desired") state changes and observed
("actual") state is sometimes vague; for some properties we clearly
separated them. For example, if a user requested the container to be
stopped or restarted, we store that on the Container object itself:

    HasBeenManuallyStopped bool // used for unless-stopped restart policy
    HasBeenManuallyRestarted bool `json:"-"` // used to distinguish restart caused by restart policy from the manual one
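For illustration only (this is not the daemon's restart-manager code, and
the helper below is a made-up name), the "unless-stopped" policy consults
that flag roughly like this:

    // Hypothetical sketch: restart on exit for "unless-stopped", unless the
    // user explicitly stopped the container at some point.
    func shouldAutoRestart(policyName string, ctr *container.Container) bool {
        return policyName == "unless-stopped" && !ctr.HasBeenManuallyStopped
    }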
Other properties are more ambiguous, such as "HasBeenStartedBefore" and
"RestartCount", which are stored on the Container (and persisted to
disk), but may be more related to "actual" state, and likely should
not be persisted:

    RestartCount int
    HasBeenStartedBefore bool
Given that (per the above) concurrency must be taken into account, most
changes to the `container.State` struct should be protected; here's where
things get blurry. While the `State` type provides various accessor methods,
only some of them take concurrency into account; for example, [State.IsRunning]
and [State.GetPID] acquire a lock, whereas [State.ExitCodeValue] does not.
Even the (commonly used) [State.StateString] has no locking at all.
The way this is handled is error-prone; [container.State] contains a mutex,
and it's exported. Given that it's embedded in the [container.Container]
struct, it's also exposed as an exported mutex for the container. The
assumption here is that by "merging" the two, the caller acquires a lock
when either the container _or_ its state must be mutated. However, because
some methods on `container.State` handle their own locking, consumers must
be deeply familiar with the internals, especially if changes to both the
`Container` AND `Container.State` must be made. This is amplified further
because some (exported!) methods, such as [container.SetRunning], mutate
multiple fields but don't acquire a lock (they expect the caller to hold
one), whereas their (also exported) counterparts (e.g. [State.IsRunning]) do.
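To illustrate the pitfall (a sketch, not code from this repository; the
variable names are made up), a caller that already holds the container's
lock must know which accessors lock internally and which don't:

    // Sketch only: the mixed locking conventions described above.
    ctr.Lock()
    ctr.State.SetRunning(c8dCtr, tsk, time.Now()) // mutates several fields; expects the caller to hold the lock
    paused := ctr.State.Paused                    // plain field read; only safe while the lock is held
    ctr.Unlock()

    // State.IsRunning acquires the lock itself, so calling it while still
    // holding ctr.Lock() would deadlock (the mutex is not re-entrant).
    running := ctr.State.IsRunning()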
It should be clear from the above that this needs some architectural
changes; a clearer separation between "desired" and "actual" state (opening
the potential to update the container's config without manually touching
its `State`), possibly a method to obtain a read-only copy of the current
state (for those querying state), and reviewing which fields belong where
(and should be persisted to disk, or only remain in memory).
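One possible shape for such a read-only copy (purely illustrative; nothing
like this is introduced in this change, and the `StateSnapshot` type and
`Snapshot` method are made-up names):

    // Hypothetical sketch: a point-in-time, read-only view of the state.
    type StateSnapshot struct {
        Running, Paused, Dead bool
        ExitCode              int
    }

    func (s *State) Snapshot() StateSnapshot {
        s.Lock()
        defer s.Unlock()
        return StateSnapshot{
            Running:  s.Running,
            Paused:   s.Paused,
            Dead:     s.Dead,
            ExitCode: s.ExitCodeValue,
        }
    }

Callers that only need to inspect state could then work from the copy,
without having to reason about the mutex at all.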
This PR preserves the status quo; it makes no structural changes, other
than making explicit where we access the container's state. Where previously
the State fields and methods were referred to as "part of the container"
(e.g. `ctr.IsRunning()` or `ctr.Running`), we now explicitly reference
the embedded `State` (`ctr.State.IsRunning()`, `ctr.State.Running`).
The exception (for now) is the mutex, which is still referenced through
the embedded struct (`ctr.Lock()` instead of `ctr.State.Lock()`), as this
is (mostly) by design: the lock protects the container and what's in it
(including its `State`).
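Taking the running-check as an example, the change amounts to:

    // Before this change: State members accessed through the embedding.
    if ctr.IsRunning() { // or: ctr.Running
        // ...
    }

    // After this change: the embedded State is referenced explicitly.
    if ctr.State.IsRunning() { // or: ctr.State.Running
        // ...
    }

    // The mutex is still used through the embedding, by design:
    ctr.Lock()
    defer ctr.Unlock()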
[State.IsRunning]: c4afa77157/daemon/container/state.go (L205-L209)
[State.GetPID]: c4afa77157/daemon/container/state.go (L211-L216)
[State.ExitCodeValue]: c4afa77157/daemon/container/state.go (L218-L228)
[State.StateString]: c4afa77157/daemon/container/state.go (L102-L131)
[container.State]: c4afa77157/daemon/container/state.go (L15-L23)
[container.Container]: c4afa77157/daemon/container/container.go (L67-L75)
[container.SetRunning]: c4afa77157/daemon/container/state.go (L230-L277)
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
package daemon

import (
	"context"
	"time"

	containerd "github.com/containerd/containerd/v2/client"
	"github.com/containerd/containerd/v2/core/containers"
	"github.com/containerd/log"
	"github.com/moby/moby/api/types/events"
	"github.com/moby/moby/v2/daemon/container"
	mobyc8dstore "github.com/moby/moby/v2/daemon/containerd"
	"github.com/moby/moby/v2/daemon/internal/libcontainerd"
	"github.com/moby/moby/v2/daemon/internal/metrics"
	"github.com/moby/moby/v2/daemon/internal/otelutil"
	"github.com/moby/moby/v2/daemon/server/backend"
	"github.com/moby/moby/v2/errdefs"
	"github.com/pkg/errors"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"
)

// validateState verifies if the container is in a non-conflicting state.
func validateState(ctr *container.Container) error {
	ctr.Lock()
	defer ctr.Unlock()

	// Intentionally checking paused first, because a container can be
	// BOTH running AND paused. To start a paused (but running) container,
	// it must be thawed ("un-paused").
	if ctr.State.Paused {
		return errdefs.Conflict(errors.New("cannot start a paused container, try unpause instead"))
	} else if ctr.State.Running {
		// This is not an actual error, but produces a 304 "not modified"
		// when returned through the API to indicate the container is
		// already in the desired state. It's implemented as an error
		// to make the code calling this function terminate early (as
		// no further processing is needed).
		return errdefs.NotModified(errors.New("container is already running"))
	}
	if ctr.State.RemovalInProgress || ctr.State.Dead {
		return errdefs.Conflict(errors.New("container is marked for removal and cannot be started"))
	}
	return nil
}

// ContainerStart starts a container.
func (daemon *Daemon) ContainerStart(ctx context.Context, name string, checkpoint string, checkpointDir string) error {
	daemonCfg := daemon.config()
	if checkpoint != "" && !daemonCfg.Experimental {
		return errdefs.InvalidParameter(errors.New("checkpoint is only supported in experimental mode"))
	}

	ctr, err := daemon.GetContainer(name)
	if err != nil {
		return err
	}
	if err := validateState(ctr); err != nil {
		return err
	}

	// Check if hostConfig is in line with the current system settings;
	// it may happen that cgroups are unmounted or the like.
	if _, err = daemon.verifyContainerSettings(daemonCfg, ctr.HostConfig, nil, false); err != nil {
		return errdefs.InvalidParameter(err)
	}

	return daemon.containerStart(ctx, daemonCfg, ctr, checkpoint, checkpointDir, true)
}

// containerStart prepares the container to run by setting up everything the
// container needs, such as storage and networking, as well as links
// between containers. The container is left waiting for a signal to
// begin running.
func (daemon *Daemon) containerStart(ctx context.Context, daemonCfg *configStore, container *container.Container, checkpoint string, checkpointDir string, resetRestartManager bool) (retErr error) {
	ctx, span := otel.Tracer("").Start(ctx, "daemon.containerStart", trace.WithAttributes(append(
		labelsAsOTelAttributes(container.Config.Labels),
		attribute.String("container.ID", container.ID),
		attribute.String("container.Name", container.Name),
	)...))
	defer func() {
		otelutil.RecordStatus(span, retErr)
		span.End()
	}()

	start := time.Now()
	container.Lock()
	defer container.Unlock()

	if resetRestartManager && container.State.Running { // skip this check if already in restarting step and resetRestartManager==false
		return nil
	}

	if container.State.RemovalInProgress || container.State.Dead {
		return errdefs.Conflict(errors.New("container is marked for removal and cannot be started"))
	}

	if checkpointDir != "" {
		// TODO(mlaventure): how would we support that?
		return errdefs.Forbidden(errors.New("custom checkpointdir is not supported"))
	}

	// if we encounter an error during start we need to ensure that any other
	// setup has been cleaned up properly
	defer func() {
		if retErr != nil {
			container.State.SetError(retErr)
			// if no one else has set it, make sure we don't leave it at zero
			if container.State.ExitCode() == 0 {
				container.State.SetExitCode(exitUnknown)
			}
			if err := container.CheckpointTo(context.WithoutCancel(ctx), daemon.containersReplica); err != nil {
				log.G(ctx).Errorf("%s: failed saving state on start failure: %v", container.ID, err)
			}
			container.Reset(false)

			daemon.Cleanup(context.WithoutCancel(ctx), container)
			// if the container's AutoRemove flag is set, remove it after clean up
			if container.HostConfig.AutoRemove {
				container.Unlock()
				if err := daemon.containerRm(&daemonCfg.Config, container.ID, &backend.ContainerRmConfig{ForceRemove: true, RemoveVolume: true}); err != nil {
					log.G(ctx).Errorf("can't remove container %s: %v", container.ID, err)
				}
				container.Lock()
			}
		}
	}()

	if err := daemon.conditionalMountOnStart(container); err != nil {
		return err
	}

	newSandbox, err := daemon.initializeNetworking(ctx, &daemonCfg.Config, container)
	if err != nil {
		return err
	}
	defer func() {
		if retErr != nil && newSandbox != nil {
			if err := newSandbox.Delete(ctx); err != nil {
				log.G(ctx).WithFields(log.Fields{
					"error":     err,
					"container": container.ID,
				}).Warn("After failure in networking initialisation, failed to remove sandbox")
			}
		}
	}()

	mnts, err := daemon.setupContainerDirs(container)
	if err != nil {
		return err
	}

	m, cleanup, err := daemon.setupMounts(ctx, container)
	if err != nil {
		return err
	}
	mnts = append(mnts, m...)
	defer cleanup(context.WithoutCancel(ctx))

	spec, err := daemon.createSpec(ctx, daemonCfg, container, mnts)
	if err != nil {
		// Any error that occurs while creating the spec, even if it's the
		// result of an invalid container config, must be considered a System
		// error (internal server error), as it's not an error with the request
		// to start the container.
		//
		// Invalid configuration in the config itself must be validated when
		// creating the container (creating its config), but some errors are
		// dependent on the current state, for example when starting a container
		// that shares a namespace with another container, and that container
		// is not running (or missing).
		return errdefs.System(err)
	}

	if resetRestartManager {
		container.ResetRestartManager(true)
		container.HasBeenManuallyStopped = false
	}

	if err := daemon.saveAppArmorConfig(container); err != nil {
		return err
	}

	if checkpoint != "" {
		checkpointDir, err = getCheckpointDir(checkpointDir, checkpoint, container.Name, container.ID, container.CheckpointDir(), false)
		if err != nil {
			return err
		}
	}

	shim, createOptions, err := daemon.getLibcontainerdCreateOptions(daemonCfg, container)
	if err != nil {
		return err
	}

	ctr, err := libcontainerd.ReplaceContainer(ctx, daemon.containerd, container.ID, spec, shim, createOptions, func(ctx context.Context, client *containerd.Client, c *containers.Container) error {
		// Only set the image if we are using containerd for image storage.
		// This is for metadata purposes only.
		// Other lower-level components may make use of this information.
		is, ok := daemon.imageService.(*mobyc8dstore.ImageService)
		if !ok {
			return nil
		}
		img, err := is.ResolveImage(ctx, container.Config.Image)
		if err != nil {
			log.G(ctx).WithError(err).WithField("container", container.ID).Warn("Failed to resolve containerd image reference")
			return nil
		}
		c.Image = img.Name
		return nil
	})
	if err != nil {
		return setExitCodeFromError(container.State.SetExitCode, err)
	}
	defer func() {
		if retErr != nil {
			if err := ctr.Delete(context.WithoutCancel(ctx)); err != nil {
				log.G(ctx).WithError(err).WithField("container", container.ID).
					Error("failed to delete failed start container")
			}
		}
	}()

	startupTime := time.Now()
	// TODO(mlaventure): we need to specify checkpoint options here
	tsk, err := ctr.NewTask(context.WithoutCancel(ctx), // passing a cancelable ctx caused integration tests to be stuck in the cleanup phase
		checkpointDir, container.StreamConfig.Stdin() != nil || container.Config.Tty,
		container.InitializeStdio)
	if err != nil {
		return setExitCodeFromError(container.State.SetExitCode, err)
	}
	defer func() {
		if retErr != nil {
			if err := tsk.ForceDelete(context.WithoutCancel(ctx)); err != nil {
				log.G(ctx).WithError(err).WithField("container", container.ID).
					Error("failed to delete task after fail start")
			}
		}
	}()

	if err := daemon.initializeCreatedTask(ctx, &daemonCfg.Config, tsk, container, spec); err != nil {
		return err
	}

	if err := tsk.Start(context.WithoutCancel(ctx)); err != nil { // passing a cancelable ctx caused integration tests to be stuck in the cleanup phase
		return setExitCodeFromError(container.State.SetExitCode, err)
	}

	container.HasBeenManuallyRestarted = false
	container.State.SetRunning(ctr, tsk, startupTime)
	container.HasBeenStartedBefore = true
	daemon.setStateCounter(container)

	daemon.initHealthMonitor(container)

	if err := container.CheckpointTo(context.WithoutCancel(ctx), daemon.containersReplica); err != nil {
		log.G(ctx).WithError(err).WithField("container", container.ID).
			Errorf("failed to store container")
	}

	daemon.LogContainerEvent(container, events.ActionStart)
	metrics.ContainerActions.WithValues("start").UpdateSince(start)

	return nil
}

// Cleanup releases any network resources allocated to the container along with any rules
// around how containers are linked together. It also unmounts the container's root filesystem.
func (daemon *Daemon) Cleanup(ctx context.Context, container *container.Container) {
	// Microsoft HCS containers get in a bad state if host resources are
	// released while the container still exists.
	if ctr, ok := container.State.C8dContainer(); ok {
		if err := ctr.Delete(context.Background()); err != nil {
			log.G(ctx).Errorf("%s cleanup: failed to delete container from containerd: %v", container.ID, err)
		}
	}

	daemon.releaseNetwork(ctx, container)

	if err := container.UnmountIpcMount(); err != nil {
		log.G(ctx).Warnf("%s cleanup: failed to unmount IPC: %s", container.ID, err)
	}

	if err := daemon.conditionalUnmountOnCleanup(container); err != nil {
		// FIXME: remove once reference counting for graphdrivers has been refactored
		// Ensure that all the mounts are gone
		if mountid, err := daemon.imageService.GetLayerMountID(container.ID); err == nil {
			daemon.cleanupMountsByID(mountid)
		}
	}

	if err := container.UnmountSecrets(); err != nil {
		log.G(ctx).Warnf("%s cleanup: failed to unmount secrets: %s", container.ID, err)
	}

	if err := recursiveUnmount(container.Root); err != nil {
		log.G(ctx).WithError(err).WithField("container", container.ID).Warn("Error while cleaning up container resource mounts.")
	}

	for _, eConfig := range container.ExecCommands.Commands() {
		daemon.unregisterExecCommand(container, eConfig)
	}

	if container.BaseFS != "" {
		if err := container.UnmountVolumes(ctx, daemon.LogVolumeEvent); err != nil {
			log.G(ctx).Warnf("%s cleanup: Failed to umount volumes: %v", container.ID, err)
		}
	}

	container.CancelAttachContext()
}