mirror of
https://github.com/moby/moby.git
synced 2026-01-11 18:51:37 +00:00
Move internal/unshare to daemon/internal/unshare
Signed-off-by: Derek McGowan <derek@mcg.dev>
This commit is contained in:
@@ -1,175 +0,0 @@
|
||||
//go:build go1.10
|
||||
|
||||
package unshare
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
func init() {
|
||||
// The startup thread of a process is special in a few different ways.
|
||||
// Most pertinent to the discussion at hand, any per-thread kernel state
|
||||
// reflected in the /proc/[pid]/ directory for a process is taken from
|
||||
// the state of the startup thread. Same goes for /proc/self/; it shows
|
||||
// the state of the current process' startup thread, no matter which
|
||||
// thread the files are being opened from. For most programs this is a
|
||||
// distinction without a difference as the kernel state, such as the
|
||||
// mount namespace and current working directory, is shared among (and
|
||||
// kept synchronized across) all threads of a process. But things start
|
||||
// to break down once threads start unsharing and modifying parts of
|
||||
// their kernel state.
|
||||
//
|
||||
// The Go runtime schedules goroutines to execute on the startup thread,
|
||||
// same as any other. How this could be problematic is best illustrated
|
||||
// with a concrete example. Consider what happens if a call to
|
||||
// Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
|
||||
// onto the startup thread. The thread's mount namespace will be
|
||||
// unshared and modified. The contents of the /proc/[pid]/mountinfo file
|
||||
// will then describe the mount tree of the unshared namespace, not the
|
||||
// namespace of any other thread. It will remain this way until the
|
||||
// process exits. (The startup thread is special in another way: exiting
|
||||
// it puts the process into a "non-waitable zombie" state. To avoid this
|
||||
// fate, the Go runtime parks the thread instead of exiting if a
|
||||
// goroutine returns while locked to the startup thread. More
|
||||
// information can be found in the Go runtime sources:
|
||||
// `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
|
||||
// package reads from /proc/self/mountinfo, so will read the mount tree
|
||||
// for the wrong namespace if the startup thread has had its mount
|
||||
// namespace unshared! The /proc/thread-self/ directory, introduced in
|
||||
// Linux 3.17, is one potential solution to this problem, but every
|
||||
// package which opens files in /proc/self/ would need to be updated,
|
||||
// and fallbacks to /proc/self/task/[tid]/ would be required to support
|
||||
// older kernels. Overlooking any reference to /proc/self/ would
|
||||
// manifest as stochastically-reproducible bugs, so this is far from an
|
||||
// ideal solution.
|
||||
//
|
||||
// Reading from /proc/self/ would not be a problem if we could prevent
|
||||
// the per-thread state of the startup thread from being modified
|
||||
// nondeterministically in the first place. We can accomplish this
|
||||
// simply by locking the main() function to the startup thread! Doing so
|
||||
// excludes any other goroutine from being scheduled on the thread.
|
||||
runtime.LockOSThread()
|
||||
}
|
||||
|
||||
// reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully
|
||||
// reversed using setns(2). The values are the basenames of the corresponding
|
||||
// /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the
|
||||
// state.
|
||||
var reversibleSetnsFlags = map[int]string{
|
||||
unix.CLONE_NEWCGROUP: "cgroup",
|
||||
unix.CLONE_NEWNET: "net",
|
||||
unix.CLONE_NEWUTS: "uts",
|
||||
unix.CLONE_NEWPID: "pid",
|
||||
unix.CLONE_NEWTIME: "time",
|
||||
|
||||
// The following CLONE_NEW* flags are not included because they imply
|
||||
// another, irreversible flag when used with unshare(2).
|
||||
// - unix.CLONE_NEWIPC: implies CLONE_SYSVMEM
|
||||
// - unix.CLONE_NEWNS: implies CLONE_FS
|
||||
// - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
|
||||
}
|
||||
|
||||
// Go calls the given functions in a new goroutine, locked to an OS thread,
|
||||
// which has had the parts of its execution state disassociated from the rest of
|
||||
// the current process using [unshare(2)]. It blocks until the new goroutine has
|
||||
// started and setupfn has returned. fn is only called if setupfn returns nil. A
|
||||
// nil setupfn or fn is equivalent to passing a no-op function.
|
||||
//
|
||||
// The disassociated execution state and any changes made to it are only visible
|
||||
// to the goroutine which the functions are called in. Any other goroutines,
|
||||
// including ones started from the function, will see the same execution state
|
||||
// as the rest of the process.
|
||||
//
|
||||
// The acceptable flags are documented in the [unshare(2)] Linux man-page.
|
||||
// The corresponding CLONE_* constants are defined in package [unix].
|
||||
//
|
||||
// # Warning
|
||||
//
|
||||
// This function may terminate the thread which the new goroutine executed on
|
||||
// after fn returns, which could cause subprocesses started with the
|
||||
// [syscall.SysProcAttr] Pdeathsig field set to be signaled before process
|
||||
// termination. Any subprocess started before this function is called may be
|
||||
// affected, in addition to any subprocesses started inside setupfn or fn.
|
||||
// There are more details at https://go.dev/issue/27505.
|
||||
//
|
||||
// [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
|
||||
func Go(flags int, setupfn func() error, fn func()) error {
|
||||
started := make(chan error)
|
||||
|
||||
maskedFlags := flags
|
||||
for f := range reversibleSetnsFlags {
|
||||
maskedFlags &^= f
|
||||
}
|
||||
isReversible := maskedFlags == 0
|
||||
|
||||
go func() {
|
||||
// Prepare to manipulate per-thread kernel state.
|
||||
runtime.LockOSThread()
|
||||
|
||||
// Not all changes to the execution state can be reverted.
|
||||
// If an irreversible change to the execution state is made, our
|
||||
// only recourse is to have the tampered thread terminated by
|
||||
// returning from this function while the goroutine remains
|
||||
// wired to the thread. The Go runtime will terminate the thread
|
||||
// and replace it with a fresh one as needed.
|
||||
|
||||
if isReversible {
|
||||
defer func() {
|
||||
if isReversible {
|
||||
// All execution state has been restored without error.
|
||||
// The thread is once again fungible.
|
||||
runtime.UnlockOSThread()
|
||||
}
|
||||
}()
|
||||
tid := unix.Gettid()
|
||||
for f, ns := range reversibleSetnsFlags {
|
||||
if flags&f != f {
|
||||
continue
|
||||
}
|
||||
// The /proc/thread-self directory was added in Linux 3.17.
|
||||
// We are not using it to maximize compatibility.
|
||||
pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns)
|
||||
fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
started <- &os.PathError{Op: "open", Path: pth, Err: err}
|
||||
return
|
||||
}
|
||||
defer func() {
|
||||
if isReversible {
|
||||
if err := unix.Setns(fd, 0); err != nil {
|
||||
isReversible = false
|
||||
}
|
||||
}
|
||||
_ = unix.Close(fd)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
// Threads are implemented under Linux as processes which share
|
||||
// a virtual memory space. Therefore in a multithreaded process
|
||||
// unshare(2) disassociates parts of the calling thread's
|
||||
// context from the thread it was clone(2)'d from.
|
||||
if err := unix.Unshare(flags); err != nil {
|
||||
started <- os.NewSyscallError("unshare", err)
|
||||
return
|
||||
}
|
||||
|
||||
if setupfn != nil {
|
||||
if err := setupfn(); err != nil {
|
||||
started <- err
|
||||
return
|
||||
}
|
||||
}
|
||||
close(started)
|
||||
|
||||
if fn != nil {
|
||||
fn()
|
||||
}
|
||||
}()
|
||||
|
||||
return <-started
|
||||
}
|
||||
Reference in New Issue
Block a user