diff --git a/daemon/archive_unix.go b/daemon/archive_unix.go index 9486de0dee..3098dbeadf 100644 --- a/daemon/archive_unix.go +++ b/daemon/archive_unix.go @@ -4,6 +4,7 @@ package daemon // import "github.com/docker/docker/daemon" import ( + "context" "io" "os" "path/filepath" @@ -12,7 +13,6 @@ import ( "github.com/docker/docker/container" "github.com/docker/docker/errdefs" "github.com/docker/docker/pkg/archive" - "github.com/docker/docker/pkg/chrootarchive" "github.com/docker/docker/pkg/ioutils" volumemounts "github.com/docker/docker/volume/mounts" "github.com/pkg/errors" @@ -24,23 +24,13 @@ func (daemon *Daemon) containerStatPath(container *container.Container, path str container.Lock() defer container.Unlock() - if err = daemon.Mount(container); err != nil { - return nil, err - } - defer daemon.Unmount(container) - - err = daemon.mountVolumes(container) - defer container.DetachAndUnmount(daemon.LogVolumeEvent) + cfs, err := daemon.openContainerFS(container) if err != nil { return nil, err } + defer cfs.Close() - resolvedPath, absPath, err := container.ResolvePath(path) - if err != nil { - return nil, err - } - - return container.StatPath(resolvedPath, absPath) + return cfs.Stat(context.TODO(), path) } // containerArchivePath creates an archive of the filesystem resource at the specified @@ -58,66 +48,40 @@ func (daemon *Daemon) containerArchivePath(container *container.Container, path } }() - if err = daemon.Mount(container); err != nil { + cfs, err := daemon.openContainerFS(container) + if err != nil { return nil, nil, err } defer func() { if err != nil { - // unmount any volumes - container.DetachAndUnmount(daemon.LogVolumeEvent) - // unmount the container's rootfs - daemon.Unmount(container) + cfs.Close() } }() - if err = daemon.mountVolumes(container); err != nil { - return nil, nil, err - } + absPath := archive.PreserveTrailingDotOrSeparator(filepath.Join("/", path), path) - resolvedPath, absPath, err := container.ResolvePath(path) + stat, err = 
cfs.Stat(context.TODO(), absPath) if err != nil { return nil, nil, err } - stat, err = container.StatPath(resolvedPath, absPath) - if err != nil { - return nil, nil, err - } - - // We need to rebase the archive entries if the last element of the - // resolved path was a symlink that was evaluated and is now different - // than the requested path. For example, if the given path was "/foo/bar/", - // but it resolved to "/var/lib/docker/containers/{id}/foo/baz/", we want - // to ensure that the archive entries start with "bar" and not "baz". This - // also catches the case when the root directory of the container is - // requested: we want the archive entries to start with "/" and not the - // container ID. - - // Get the source and the base paths of the container resolved path in order - // to get the proper tar options for the rebase tar. - resolvedPath = filepath.Clean(resolvedPath) - if filepath.Base(resolvedPath) == "." { - resolvedPath += string(filepath.Separator) + "." - } - - sourceDir := resolvedPath - sourceBase := "." - + sourceDir, sourceBase := absPath, "." 
if stat.Mode&os.ModeDir == 0 { // not dir - sourceDir, sourceBase = filepath.Split(resolvedPath) + sourceDir, sourceBase = filepath.Split(absPath) } opts := archive.TarResourceRebaseOpts(sourceBase, filepath.Base(absPath)) - data, err := chrootarchive.Tar(sourceDir, opts, container.BaseFS) + tb, err := archive.NewTarballer(sourceDir, opts) if err != nil { return nil, nil, err } + cfs.GoInFS(context.TODO(), tb.Do) + data := tb.Reader() content = ioutils.NewReadCloserWrapper(data, func() error { err := data.Close() - container.DetachAndUnmount(daemon.LogVolumeEvent) - daemon.Unmount(container) + _ = cfs.Close() container.Unlock() return err }) @@ -137,77 +101,58 @@ func (daemon *Daemon) containerExtractToDir(container *container.Container, path container.Lock() defer container.Unlock() - if err = daemon.Mount(container); err != nil { - return err - } - defer daemon.Unmount(container) - - err = daemon.mountVolumes(container) - defer container.DetachAndUnmount(daemon.LogVolumeEvent) + cfs, err := daemon.openContainerFS(container) if err != nil { return err } + defer cfs.Close() - // The destination path needs to be resolved to a host path, with all - // symbolic links followed in the scope of the container's rootfs. Note - // that we do not use `container.ResolvePath(path)` here because we need - // to also evaluate the last path element if it is a symlink. This is so - // that you can extract an archive to a symlink that points to a directory. - - // Consider the given path as an absolute path in the container. - absPath := archive.PreserveTrailingDotOrSeparator(filepath.Join(string(filepath.Separator), path), path) - - // This will evaluate the last path element if it is a symlink. 
- resolvedPath, err := container.GetResourcePath(absPath) - if err != nil { - return err - } - - stat, err := os.Lstat(resolvedPath) - if err != nil { - return err - } - - if !stat.IsDir() { - return errdefs.InvalidParameter(errors.New("extraction point is not a directory")) - } - - // Need to check if the path is in a volume. If it is, it cannot be in a - // read-only volume. If it is not in a volume, the container cannot be - // configured with a read-only rootfs. - - // Use the resolved path relative to the container rootfs as the new - // absPath. This way we fully follow any symlinks in a volume that may - // lead back outside the volume. - baseRel, err := filepath.Rel(container.BaseFS, resolvedPath) - if err != nil { - return err - } - // Make it an absolute path. - absPath = filepath.Join(string(filepath.Separator), baseRel) - - toVolume, err := checkIfPathIsInAVolume(container, absPath) - if err != nil { - return err - } - - if !toVolume && container.HostConfig.ReadonlyRootfs { - return errdefs.InvalidParameter(errors.New("container rootfs is marked read-only")) - } - - options := daemon.defaultTarCopyOptions(noOverwriteDirNonDir) - - if copyUIDGID { - var err error - // tarCopyOptions will appropriately pull in the right uid/gid for the - // user/group and will set the options. - options, err = daemon.tarCopyOptions(container, noOverwriteDirNonDir) + err = cfs.RunInFS(context.TODO(), func() error { + // The destination path needs to be resolved with all symbolic links + // followed. Note that we need to also evaluate the last path element if + // it is a symlink. This is so that you can extract an archive to a + // symlink that points to a directory. 
+ absPath, err := filepath.EvalSymlinks(filepath.Join("/", path)) if err != nil { return err } - } + absPath = archive.PreserveTrailingDotOrSeparator(absPath, path) - if err := chrootarchive.UntarWithRoot(content, resolvedPath, options, container.BaseFS); err != nil { + stat, err := os.Lstat(absPath) + if err != nil { + return err + } + if !stat.IsDir() { + return errdefs.InvalidParameter(errors.New("extraction point is not a directory")) + } + + // Need to check if the path is in a volume. If it is, it cannot be in a + // read-only volume. If it is not in a volume, the container cannot be + // configured with a read-only rootfs. + toVolume, err := checkIfPathIsInAVolume(container, absPath) + if err != nil { + return err + } + + if !toVolume && container.HostConfig.ReadonlyRootfs { + return errdefs.InvalidParameter(errors.New("container rootfs is marked read-only")) + } + + options := daemon.defaultTarCopyOptions(noOverwriteDirNonDir) + + if copyUIDGID { + var err error + // tarCopyOptions will appropriately pull in the right uid/gid for the + // user/group and will set the options. 
+ options, err = daemon.tarCopyOptions(container, noOverwriteDirNonDir) + if err != nil { + return err + } + } + + return archive.Untar(content, absPath, options) + }) + if err != nil { return err } @@ -217,9 +162,6 @@ func (daemon *Daemon) containerExtractToDir(container *container.Container, path } func (daemon *Daemon) containerCopy(container *container.Container, resource string) (rc io.ReadCloser, err error) { - if resource[0] == '/' || resource[0] == '\\' { - resource = resource[1:] - } container.Lock() defer func() { @@ -231,49 +173,36 @@ func (daemon *Daemon) containerCopy(container *container.Container, resource str } }() - if err := daemon.Mount(container); err != nil { + cfs, err := daemon.openContainerFS(container) + if err != nil { return nil, err } - defer func() { if err != nil { - // unmount any volumes - container.DetachAndUnmount(daemon.LogVolumeEvent) - // unmount the container's rootfs - daemon.Unmount(container) + cfs.Close() } }() - if err := daemon.mountVolumes(container); err != nil { - return nil, err - } - - basePath, err := container.GetResourcePath(resource) - if err != nil { - return nil, err - } - stat, err := os.Stat(basePath) - if err != nil { - return nil, err - } - var filter []string - if !stat.IsDir() { - d, f := filepath.Split(basePath) - basePath = d - filter = []string{f} - } - archv, err := chrootarchive.Tar(basePath, &archive.TarOptions{ - Compression: archive.Uncompressed, - IncludeFiles: filter, - }, container.BaseFS) + err = cfs.RunInFS(context.TODO(), func() error { + _, err := os.Stat(resource) + return err + }) if err != nil { return nil, err } + tb, err := archive.NewTarballer(resource, &archive.TarOptions{ + Compression: archive.Uncompressed, + }) + if err != nil { + return nil, err + } + + cfs.GoInFS(context.TODO(), tb.Do) + archv := tb.Reader() reader := ioutils.NewReadCloserWrapper(archv, func() error { err := archv.Close() - container.DetachAndUnmount(daemon.LogVolumeEvent) - daemon.Unmount(container) + _ = 
cfs.Close() container.Unlock() return err }) diff --git a/daemon/containerfs_linux.go b/daemon/containerfs_linux.go new file mode 100644 index 0000000000..b7420b9244 --- /dev/null +++ b/daemon/containerfs_linux.go @@ -0,0 +1,221 @@ +package daemon // import "github.com/docker/docker/daemon" + +import ( + "context" + "os" + "path/filepath" + "runtime" + "strings" + + "github.com/hashicorp/go-multierror" + "github.com/moby/sys/mount" + "github.com/moby/sys/symlink" + "golang.org/x/sys/unix" + + "github.com/docker/docker/api/types" + "github.com/docker/docker/container" + "github.com/docker/docker/internal/mounttree" + "github.com/docker/docker/internal/unshare" + "github.com/docker/docker/pkg/fileutils" +) + +type future struct { + fn func() error + res chan<- error +} + +// containerFSView allows functions to be run in the context of a container's +// filesystem. Inside these functions, the root directory is the container root +// for all native OS filesystem APIs, including, but not limited to, the [os] +// and [golang.org/x/sys/unix] packages. The view of the container's filesystem +// is live and read-write. Each view has its own private set of tmpfs mounts. +// Any files written under a tmpfs mount are not visible to processes inside the +// container nor any other view of the container's filesystem, and vice versa. +// +// Each view has its own current working directory which is initialized to the +// root of the container filesystem and can be changed with [os.Chdir]. Changes +// to the current directory persist across successive [*containerFSView.RunInFS] +// and [*containerFSView.GoInFS] calls. +// +// Multiple views of the same container filesystem can coexist at the same time. +// Only one function can be running in a particular filesystem view at any given +// time. Calls to [*containerFSView.RunInFS] or [*containerFSView.GoInFS] will +// block while another function is running. 
If more than one call is blocked +// concurrently, the order they are unblocked is undefined. +type containerFSView struct { + d *Daemon + ctr *container.Container + todo chan future + done chan error +} + +// openContainerFS opens a new view of the container's filesystem. +func (daemon *Daemon) openContainerFS(container *container.Container) (_ *containerFSView, err error) { + if err := daemon.Mount(container); err != nil { + return nil, err + } + defer func() { + if err != nil { + _ = daemon.Unmount(container) + } + }() + + mounts, err := daemon.setupMounts(container) + if err != nil { + return nil, err + } + defer func() { + if err != nil { + _ = container.UnmountVolumes(daemon.LogVolumeEvent) + } + }() + + // Setup in initial mount namespace complete. We're ready to unshare the + // mount namespace and bind the volume mounts into that private view of + // the container FS. + todo := make(chan future) + done := make(chan error) + err = unshare.Go(unix.CLONE_NEWNS, + func() error { + if err := mount.MakeRSlave("/"); err != nil { + return err + } + for _, m := range mounts { + dest, err := container.GetResourcePath(m.Destination) + if err != nil { + return err + } + + var stat os.FileInfo + stat, err = os.Stat(m.Source) + if err != nil { + return err + } + if err := fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil { + return err + } + + bindMode := "rbind" + if m.NonRecursive { + bindMode = "bind" + } + writeMode := "ro" + if m.Writable { + writeMode = "rw" + } + + // openContainerFS() is called for temporary mounts + // outside the container. Soon these will be unmounted + // with lazy unmount option and given we have mounted + // them rbind, all the submounts will propagate if these + // are shared. If daemon is running in host namespace + // and has / as shared then these unmounts will + // propagate and unmount original mount as well. So make + // all these mounts rprivate. 
Do not use propagation + // property of volume as that should apply only when + // mounting happens inside the container. + opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",") + if err := mount.Mount(m.Source, dest, "", opts); err != nil { + return err + } + } + + return mounttree.SwitchRoot(container.BaseFS) + }, + func() { + defer close(done) + + for it := range todo { + err := it.fn() + if it.res != nil { + it.res <- err + } + } + + // The thread will terminate when this goroutine returns, taking the + // mount namespace and all the volume bind-mounts with it. + }, + ) + if err != nil { + return nil, err + } + vw := &containerFSView{ + d: daemon, + ctr: container, + todo: todo, + done: done, + } + runtime.SetFinalizer(vw, (*containerFSView).Close) + return vw, nil +} + +// RunInFS synchronously runs fn in the context of the container filesystem and +// passes through its return value. +// +// The container filesystem is only visible to functions called in the same +// goroutine as fn. Goroutines started from fn will see the host's filesystem. +func (vw *containerFSView) RunInFS(ctx context.Context, fn func() error) error { + res := make(chan error) + select { + case vw.todo <- future{fn: fn, res: res}: + case <-ctx.Done(): + return ctx.Err() + } + return <-res +} + +// GoInFS starts fn in the container FS. It blocks until fn is started but does +// not wait until fn returns. An error is returned if ctx is canceled before fn +// has been started. +// +// The container filesystem is only visible to functions called in the same +// goroutine as fn. Goroutines started from fn will see the host's filesystem. +func (vw *containerFSView) GoInFS(ctx context.Context, fn func()) error { + select { + case vw.todo <- future{fn: func() error { fn(); return nil }}: + return nil + case <-ctx.Done(): + return ctx.Err() + } +} + +// Close waits until any in-flight operations complete and frees all +// resources associated with vw. 
+func (vw *containerFSView) Close() error { + runtime.SetFinalizer(vw, nil) + close(vw.todo) + err := multierror.Append(nil, <-vw.done) + err = multierror.Append(err, vw.ctr.UnmountVolumes(vw.d.LogVolumeEvent)) + err = multierror.Append(err, vw.d.Unmount(vw.ctr)) + return err.ErrorOrNil() +} + +// Stat returns the metadata for path, relative to the current working directory +// of vw inside the container filesystem view. +func (vw *containerFSView) Stat(ctx context.Context, path string) (*types.ContainerPathStat, error) { + var stat *types.ContainerPathStat + err := vw.RunInFS(ctx, func() error { + lstat, err := os.Lstat(path) + if err != nil { + return err + } + var target string + if lstat.Mode()&os.ModeSymlink != 0 { + // Fully evaluate symlinks along path to the ultimate + // target, or as much as possible with broken links. + target, err = symlink.FollowSymlinkInScope(path, "/") + if err != nil { + return err + } + } + stat = &types.ContainerPathStat{ + Name: filepath.Base(path), + Size: lstat.Size(), + Mode: lstat.Mode(), + Mtime: lstat.ModTime(), + LinkTarget: target, + } + return nil + }) + return stat, err +} diff --git a/daemon/volumes_unix.go b/daemon/volumes_unix.go index 59a95c239a..8e63203243 100644 --- a/daemon/volumes_unix.go +++ b/daemon/volumes_unix.go @@ -12,9 +12,7 @@ import ( mounttypes "github.com/docker/docker/api/types/mount" "github.com/docker/docker/container" - "github.com/docker/docker/pkg/fileutils" volumemounts "github.com/docker/docker/volume/mounts" - "github.com/moby/sys/mount" ) // setupMounts iterates through each of the mount points for a container and @@ -112,51 +110,3 @@ func setBindModeIfNull(bind *volumemounts.MountPoint) { bind.Mode = "z" } } - -func (daemon *Daemon) mountVolumes(container *container.Container) error { - mounts, err := daemon.setupMounts(container) - if err != nil { - return err - } - - for _, m := range mounts { - dest, err := container.GetResourcePath(m.Destination) - if err != nil { - return err - } - - 
var stat os.FileInfo - stat, err = os.Stat(m.Source) - if err != nil { - return err - } - if err = fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil { - return err - } - - bindMode := "rbind" - if m.NonRecursive { - bindMode = "bind" - } - writeMode := "ro" - if m.Writable { - writeMode = "rw" - } - - // mountVolumes() seems to be called for temporary mounts - // outside the container. Soon these will be unmounted with - // lazy unmount option and given we have mounted the rbind, - // all the submounts will propagate if these are shared. If - // daemon is running in host namespace and has / as shared - // then these unmounts will propagate and unmount original - // mount as well. So make all these mounts rprivate. - // Do not use propagation property of volume as that should - // apply only when mounting happens inside the container. - opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",") - if err := mount.Mount(m.Source, dest, "", opts); err != nil { - return err - } - } - - return nil -}