Move oci to daemon/pkg/oci

Signed-off-by: Derek McGowan <derek@mcg.dev>
This commit is contained in:
Derek McGowan
2025-07-24 12:11:19 -07:00
parent f24455c90b
commit c74ba95583
26 changed files with 14 additions and 14 deletions

View File

@@ -0,0 +1,21 @@
package caps
// DefaultCapabilities returns a Linux kernel default capabilities
func DefaultCapabilities() []string {
return []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
}
}

View File

@@ -0,0 +1,127 @@
package caps
import (
"fmt"
"strings"
"github.com/docker/docker/errdefs"
)
var (
allCaps []string
// knownCapabilities is a map of all known capabilities, using capability
// name as index. Nil values indicate that the capability is known, but either
// not supported by the Kernel, or not available in the current environment,
// for example, when running Docker-in-Docker with restricted capabilities.
//
// Capabilities are one of the security systems in Linux Security Module (LSM)
// framework provided by the kernel.
// For more details on capabilities, see http://man7.org/linux/man-pages/man7/capabilities.7.html
knownCaps map[string]*struct{}
)
// GetAllCapabilities returns all capabilities that are available in the current
// environment.
func GetAllCapabilities() []string {
initCaps()
return allCaps
}
// knownCapabilities returns a map of all known capabilities, using capability
// name as index. Nil values indicate that the capability is known, but either
// not supported by the Kernel, or not available in the current environment, for
// example, when running Docker-in-Docker with restricted capabilities.
func knownCapabilities() map[string]*struct{} {
initCaps()
return knownCaps
}
// inSlice tests whether a string is contained in a slice of strings or not.
func inSlice(slice []string, s string) bool {
for _, ss := range slice {
if s == ss {
return true
}
}
return false
}
const allCapabilities = "ALL"
// NormalizeLegacyCapabilities normalizes, and validates CapAdd/CapDrop capabilities
// by upper-casing them, and adding a CAP_ prefix (if not yet present).
//
// This function also accepts the "ALL" magic-value, that's used by CapAdd/CapDrop.
func NormalizeLegacyCapabilities(caps []string) ([]string, error) {
var (
normalized []string
capabilityList = knownCapabilities()
)
for _, c := range caps {
c = strings.ToUpper(c)
if c == allCapabilities {
normalized = append(normalized, c)
continue
}
if !strings.HasPrefix(c, "CAP_") {
c = "CAP_" + c
}
if v, ok := capabilityList[c]; !ok {
return nil, errdefs.InvalidParameter(fmt.Errorf("unknown capability: %q", c))
} else if v == nil {
return nil, errdefs.InvalidParameter(fmt.Errorf("capability not supported by your kernel or not available in the current environment: %q", c))
}
normalized = append(normalized, c)
}
return normalized, nil
}
// TweakCapabilities tweaks capabilities by adding, dropping, or overriding
// capabilities in the basics capabilities list. All capabilities are added
// if privileged is true.
func TweakCapabilities(basics, adds, drops []string, privileged bool) ([]string, error) {
switch {
case privileged:
// Privileged containers get all capabilities
return GetAllCapabilities(), nil
case len(adds) == 0 && len(drops) == 0:
// Nothing to tweak; we're done
return basics, nil
}
capDrop, err := NormalizeLegacyCapabilities(drops)
if err != nil {
return nil, err
}
capAdd, err := NormalizeLegacyCapabilities(adds)
if err != nil {
return nil, err
}
var caps []string
switch {
case inSlice(capAdd, allCapabilities):
// Add all capabilities except ones on capDrop
for _, c := range GetAllCapabilities() {
if !inSlice(capDrop, c) {
caps = append(caps, c)
}
}
case inSlice(capDrop, allCapabilities):
// "Drop" all capabilities; use what's in capAdd instead
caps = capAdd
default:
// First drop some capabilities
for _, c := range basics {
if !inSlice(capDrop, c) {
caps = append(caps, c)
}
}
// Then add the list of capabilities from capAdd
caps = append(caps, capAdd...)
}
return caps, nil
}

View File

@@ -0,0 +1,37 @@
package caps
import (
"context"
"sync"
ccaps "github.com/containerd/containerd/v2/pkg/cap"
"github.com/containerd/log"
)
var initCapsOnce sync.Once
func initCaps() {
initCapsOnce.Do(func() {
rawCaps := ccaps.Known()
curCaps, err := ccaps.Current()
if err != nil {
log.G(context.TODO()).WithError(err).Error("failed to get capabilities from current environment")
allCaps = rawCaps
} else {
allCaps = curCaps
}
knownCaps = make(map[string]*struct{}, len(rawCaps))
for _, capName := range rawCaps {
// For now, we assume the capability is available if we failed to
// get the capabilities from the current environment. This keeps the
// old (pre-detection) behavior, and prevents creating containers with
// no capabilities. The OCI runtime or kernel may still refuse capa-
// bilities that are not available, and produce an error in that case.
if len(curCaps) > 0 && !inSlice(curCaps, capName) {
knownCaps[capName] = nil
continue
}
knownCaps[capName] = &struct{}{}
}
})
}

View File

@@ -0,0 +1,7 @@
//go:build !linux
package caps
func initCaps() {
// no capabilities on Windows
}

221
daemon/pkg/oci/defaults.go Normal file
View File

@@ -0,0 +1,221 @@
// TODO(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package oci
import (
"fmt"
"os"
"runtime"
"sync"
"github.com/docker/docker/daemon/pkg/oci/caps"
"github.com/docker/docker/internal/platform"
"github.com/opencontainers/runtime-spec/specs-go"
)
func iPtr(i int64) *int64 { return &i }
const defaultUnixPathEnv = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
// DefaultPathEnv is unix style list of directories to search for
// executables. Each directory is separated from the next by a colon
// ':' character .
// For Windows containers, an empty string is returned as the default
// path will be set by the container, and Docker has no context of what the
// default path should be.
//
// TODO(thaJeztah) align Windows default with BuildKit; see https://github.com/moby/buildkit/pull/1747
// TODO(thaJeztah) use defaults from containerd (but align it with BuildKit; see https://github.com/moby/buildkit/pull/1747)
func DefaultPathEnv(os string) string {
if os == "windows" {
return ""
}
return defaultUnixPathEnv
}
// DefaultSpec returns the default spec used by docker for the current Platform
func DefaultSpec() specs.Spec {
if runtime.GOOS == "windows" {
return DefaultWindowsSpec()
}
return DefaultLinuxSpec()
}
// DefaultWindowsSpec create a default spec for running Windows containers
func DefaultWindowsSpec() specs.Spec {
return specs.Spec{
Version: specs.Version,
Windows: &specs.Windows{},
Process: &specs.Process{},
Root: &specs.Root{},
}
}
// DefaultLinuxSpec create a default spec for running Linux containers
func DefaultLinuxSpec() specs.Spec {
return specs.Spec{
Version: specs.Version,
Process: &specs.Process{
Capabilities: &specs.LinuxCapabilities{
Bounding: caps.DefaultCapabilities(),
Permitted: caps.DefaultCapabilities(),
Effective: caps.DefaultCapabilities(),
},
},
Root: &specs.Root{},
Mounts: []specs.Mount{
{
Destination: "/proc",
Type: "proc",
Source: "proc",
Options: []string{"nosuid", "noexec", "nodev"},
},
{
Destination: "/dev",
Type: "tmpfs",
Source: "tmpfs",
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
},
{
Destination: "/dev/pts",
Type: "devpts",
Source: "devpts",
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
},
{
Destination: "/sys",
Type: "sysfs",
Source: "sysfs",
Options: []string{"nosuid", "noexec", "nodev", "ro"},
},
{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "cgroup",
Options: []string{"ro", "nosuid", "noexec", "nodev"},
},
{
Destination: "/dev/mqueue",
Type: "mqueue",
Source: "mqueue",
Options: []string{"nosuid", "noexec", "nodev"},
},
{
Destination: "/dev/shm",
Type: "tmpfs",
Source: "shm",
Options: []string{"nosuid", "noexec", "nodev", "mode=1777"},
},
},
Linux: &specs.Linux{
MaskedPaths: defaultLinuxMaskedPaths(),
ReadonlyPaths: []string{
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger",
},
Namespaces: []specs.LinuxNamespace{
{Type: specs.MountNamespace},
{Type: specs.NetworkNamespace},
{Type: specs.UTSNamespace},
{Type: specs.PIDNamespace},
{Type: specs.IPCNamespace},
},
// Devices implicitly contains the following devices:
// null, zero, full, random, urandom, tty, console, and ptmx.
// ptmx is a bind mount or symlink of the container's ptmx.
// See also: https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#default-devices
Devices: []specs.LinuxDevice{},
Resources: &specs.LinuxResources{
Devices: []specs.LinuxDeviceCgroup{
{
Allow: false,
Access: "rwm",
},
{
Allow: true,
Type: "c",
Major: iPtr(1),
Minor: iPtr(5),
Access: "rwm",
},
{
Allow: true,
Type: "c",
Major: iPtr(1),
Minor: iPtr(3),
Access: "rwm",
},
{
Allow: true,
Type: "c",
Major: iPtr(1),
Minor: iPtr(9),
Access: "rwm",
},
{
Allow: true,
Type: "c",
Major: iPtr(1),
Minor: iPtr(8),
Access: "rwm",
},
{
Allow: true,
Type: "c",
Major: iPtr(5),
Minor: iPtr(0),
Access: "rwm",
},
{
Allow: true,
Type: "c",
Major: iPtr(5),
Minor: iPtr(1),
Access: "rwm",
},
{
Allow: false,
Type: "c",
Major: iPtr(10),
Minor: iPtr(229),
Access: "rwm",
},
},
},
},
}
}
// defaultLinuxMaskedPaths returns the default list of paths to mask in a Linux
// container. The paths won't change while the docker daemon is running, so just
// compute them once.
var defaultLinuxMaskedPaths = sync.OnceValue(func() []string {
maskedPaths := []string{
"/proc/acpi",
"/proc/asound",
"/proc/interrupts", // https://github.com/moby/moby/security/advisories/GHSA-6fw5-f8r9-fgfm
"/proc/kcore",
"/proc/keys",
"/proc/latency_stats",
"/proc/sched_debug",
"/proc/scsi",
"/proc/timer_list",
"/proc/timer_stats",
"/sys/devices/virtual/powercap", // https://github.com/moby/moby/security/advisories/GHSA-jq35-85cj-fj4p
"/sys/firmware",
}
// https://github.com/moby/moby/security/advisories/GHSA-6fw5-f8r9-fgfm
cpus := platform.PossibleCPU()
for _, cpu := range cpus {
path := fmt.Sprintf("/sys/devices/system/cpu/cpu%d/thermal_throttle", cpu)
if _, err := os.Stat(path); err == nil {
maskedPaths = append(maskedPaths, path)
}
}
return maskedPaths
})

View File

@@ -0,0 +1,71 @@
package oci
import (
"errors"
"fmt"
"os"
"path/filepath"
"strings"
coci "github.com/containerd/containerd/v2/pkg/oci"
"github.com/opencontainers/runtime-spec/specs-go"
)
func deviceCgroup(d *specs.LinuxDevice, permissions string) specs.LinuxDeviceCgroup {
return specs.LinuxDeviceCgroup{
Allow: true,
Type: d.Type,
Major: &d.Major,
Minor: &d.Minor,
Access: permissions,
}
}
// DevicesFromPath computes a list of devices and device permissions from paths (pathOnHost and pathInContainer) and cgroup permissions.
func DevicesFromPath(pathOnHost, pathInContainer, cgroupPermissions string) (devs []specs.LinuxDevice, devPermissions []specs.LinuxDeviceCgroup, _ error) {
resolvedPathOnHost := pathOnHost
// check if it is a symbolic link
if src, e := os.Lstat(pathOnHost); e == nil && src.Mode()&os.ModeSymlink == os.ModeSymlink {
if linkedPathOnHost, e := filepath.EvalSymlinks(pathOnHost); e == nil {
resolvedPathOnHost = linkedPathOnHost
}
}
device, err := coci.DeviceFromPath(resolvedPathOnHost)
// if there was no error, return the device
if err == nil {
device.Path = pathInContainer
return append(devs, *device), append(devPermissions, deviceCgroup(device, cgroupPermissions)), nil
}
// if the device is not a device node
// try to see if it's a directory holding many devices
if errors.Is(err, coci.ErrNotADevice) {
// check if it is a directory
if src, e := os.Stat(resolvedPathOnHost); e == nil && src.IsDir() {
// mount the internal devices recursively
// TODO check if additional errors should be handled or logged
_ = filepath.WalkDir(resolvedPathOnHost, func(dpath string, f os.DirEntry, _ error) error {
childDevice, e := coci.DeviceFromPath(dpath)
if e != nil {
// ignore the device
return nil
}
// add the device to userSpecified devices
childDevice.Path = strings.Replace(dpath, resolvedPathOnHost, pathInContainer, 1)
devs = append(devs, *childDevice)
devPermissions = append(devPermissions, deviceCgroup(childDevice, cgroupPermissions))
return nil
})
}
}
if len(devs) > 0 {
return devs, devPermissions, nil
}
return devs, devPermissions, fmt.Errorf("error gathering device information while adding custom device %q: %s", pathOnHost, err)
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,813 @@
{
"defaultAction": "SCMP_ACT_ERRNO",
"archMap": [
{
"architecture": "SCMP_ARCH_X86_64",
"subArchitectures": [
"SCMP_ARCH_X86",
"SCMP_ARCH_X32"
]
},
{
"architecture": "SCMP_ARCH_AARCH64",
"subArchitectures": [
"SCMP_ARCH_ARM"
]
},
{
"architecture": "SCMP_ARCH_MIPS64",
"subArchitectures": [
"SCMP_ARCH_MIPS",
"SCMP_ARCH_MIPS64N32"
]
},
{
"architecture": "SCMP_ARCH_MIPS64N32",
"subArchitectures": [
"SCMP_ARCH_MIPS",
"SCMP_ARCH_MIPS64"
]
},
{
"architecture": "SCMP_ARCH_MIPSEL64",
"subArchitectures": [
"SCMP_ARCH_MIPSEL",
"SCMP_ARCH_MIPSEL64N32"
]
},
{
"architecture": "SCMP_ARCH_MIPSEL64N32",
"subArchitectures": [
"SCMP_ARCH_MIPSEL",
"SCMP_ARCH_MIPSEL64"
]
},
{
"architecture": "SCMP_ARCH_S390X",
"subArchitectures": [
"SCMP_ARCH_S390"
]
}
],
"syscalls": [
{
"names": [
"accept",
"accept4",
"access",
"adjtimex",
"alarm",
"bind",
"brk",
"capget",
"capset",
"chdir",
"chmod",
"chown",
"chown32",
"clock_getres",
"clock_getres_time64",
"clock_gettime",
"clock_gettime64",
"clock_nanosleep",
"clock_nanosleep_time64",
"close",
"connect",
"copy_file_range",
"creat",
"dup",
"dup2",
"dup3",
"epoll_create",
"epoll_create1",
"epoll_ctl",
"epoll_ctl_old",
"epoll_pwait",
"epoll_wait",
"epoll_wait_old",
"eventfd",
"eventfd2",
"execve",
"execveat",
"exit",
"exit_group",
"faccessat",
"fadvise64",
"fadvise64_64",
"fallocate",
"fanotify_mark",
"fchdir",
"fchmod",
"fchmodat",
"fchown",
"fchown32",
"fchownat",
"fcntl",
"fcntl64",
"fdatasync",
"fgetxattr",
"flistxattr",
"flock",
"fork",
"fremovexattr",
"fsetxattr",
"fstat",
"fstat64",
"fstatat64",
"fstatfs",
"fstatfs64",
"fsync",
"ftruncate",
"ftruncate64",
"futex",
"futex_time64",
"futimesat",
"getcpu",
"getcwd",
"getdents",
"getdents64",
"getegid",
"getegid32",
"geteuid",
"geteuid32",
"getgid",
"getgid32",
"getgroups",
"getgroups32",
"getitimer",
"getpeername",
"getpgid",
"getpgrp",
"getpid",
"getppid",
"getpriority",
"getrandom",
"getresgid",
"getresgid32",
"getresuid",
"getresuid32",
"getrlimit",
"get_robust_list",
"getrusage",
"getsid",
"getsockname",
"getsockopt",
"get_thread_area",
"gettid",
"gettimeofday",
"getuid",
"getuid32",
"getxattr",
"inotify_add_watch",
"inotify_init",
"inotify_init1",
"inotify_rm_watch",
"io_cancel",
"ioctl",
"io_destroy",
"io_getevents",
"io_pgetevents",
"io_pgetevents_time64",
"ioprio_get",
"ioprio_set",
"io_setup",
"io_submit",
"ipc",
"kill",
"lchown",
"lchown32",
"lgetxattr",
"link",
"linkat",
"listen",
"listxattr",
"llistxattr",
"_llseek",
"lremovexattr",
"lseek",
"lsetxattr",
"lstat",
"lstat64",
"madvise",
"memfd_create",
"mincore",
"mkdir",
"mkdirat",
"mknod",
"mknodat",
"mlock",
"mlock2",
"mlockall",
"mmap",
"mmap2",
"mprotect",
"mq_getsetattr",
"mq_notify",
"mq_open",
"mq_timedreceive",
"mq_timedreceive_time64",
"mq_timedsend",
"mq_timedsend_time64",
"mq_unlink",
"mremap",
"msgctl",
"msgget",
"msgrcv",
"msgsnd",
"msync",
"munlock",
"munlockall",
"munmap",
"nanosleep",
"newfstatat",
"_newselect",
"open",
"openat",
"pause",
"pipe",
"pipe2",
"poll",
"ppoll",
"ppoll_time64",
"prctl",
"pread64",
"preadv",
"preadv2",
"prlimit64",
"pselect6",
"pselect6_time64",
"pwrite64",
"pwritev",
"pwritev2",
"read",
"readahead",
"readlink",
"readlinkat",
"readv",
"recv",
"recvfrom",
"recvmmsg",
"recvmmsg_time64",
"recvmsg",
"remap_file_pages",
"removexattr",
"rename",
"renameat",
"renameat2",
"restart_syscall",
"rmdir",
"rt_sigaction",
"rt_sigpending",
"rt_sigprocmask",
"rt_sigqueueinfo",
"rt_sigreturn",
"rt_sigsuspend",
"rt_sigtimedwait",
"rt_sigtimedwait_time64",
"rt_tgsigqueueinfo",
"sched_getaffinity",
"sched_getattr",
"sched_getparam",
"sched_get_priority_max",
"sched_get_priority_min",
"sched_getscheduler",
"sched_rr_get_interval",
"sched_rr_get_interval_time64",
"sched_setaffinity",
"sched_setattr",
"sched_setparam",
"sched_setscheduler",
"sched_yield",
"seccomp",
"select",
"semctl",
"semget",
"semop",
"semtimedop",
"semtimedop_time64",
"send",
"sendfile",
"sendfile64",
"sendmmsg",
"sendmsg",
"sendto",
"setfsgid",
"setfsgid32",
"setfsuid",
"setfsuid32",
"setgid",
"setgid32",
"setgroups",
"setgroups32",
"setitimer",
"setpgid",
"setpriority",
"setregid",
"setregid32",
"setresgid",
"setresgid32",
"setresuid",
"setresuid32",
"setreuid",
"setreuid32",
"setrlimit",
"set_robust_list",
"setsid",
"setsockopt",
"set_thread_area",
"set_tid_address",
"setuid",
"setuid32",
"setxattr",
"shmat",
"shmctl",
"shmdt",
"shmget",
"shutdown",
"sigaltstack",
"signalfd",
"signalfd4",
"sigprocmask",
"sigreturn",
"socket",
"socketcall",
"socketpair",
"splice",
"stat",
"stat64",
"statfs",
"statfs64",
"statx",
"symlink",
"symlinkat",
"sync",
"sync_file_range",
"syncfs",
"sysinfo",
"tee",
"tgkill",
"time",
"timer_create",
"timer_delete",
"timer_getoverrun",
"timer_gettime",
"timer_gettime64",
"timer_settime",
"timer_settime64",
"timerfd_create",
"timerfd_gettime",
"timerfd_gettime64",
"timerfd_settime",
"timerfd_settime64",
"times",
"tkill",
"truncate",
"truncate64",
"ugetrlimit",
"umask",
"uname",
"unlink",
"unlinkat",
"utime",
"utimensat",
"utimensat_time64",
"utimes",
"vfork",
"vmsplice",
"wait4",
"waitid",
"waitpid",
"write",
"writev"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {},
"excludes": {}
},
{
"names": [
"ptrace"
],
"action": "SCMP_ACT_ALLOW",
"args": null,
"comment": "",
"includes": {
"minKernel": "4.8"
},
"excludes": {}
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 0,
"valueTwo": 0,
"op": "SCMP_CMP_EQ"
}
],
"comment": "",
"includes": {},
"excludes": {}
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 8,
"valueTwo": 0,
"op": "SCMP_CMP_EQ"
}
],
"comment": "",
"includes": {},
"excludes": {}
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 131072,
"valueTwo": 0,
"op": "SCMP_CMP_EQ"
}
],
"comment": "",
"includes": {},
"excludes": {}
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 131080,
"valueTwo": 0,
"op": "SCMP_CMP_EQ"
}
],
"comment": "",
"includes": {},
"excludes": {}
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 4294967295,
"valueTwo": 0,
"op": "SCMP_CMP_EQ"
}
],
"comment": "",
"includes": {},
"excludes": {}
},
{
"names": [
"sync_file_range2"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"arches": [
"ppc64le"
]
},
"excludes": {}
},
{
"names": [
"arm_fadvise64_64",
"arm_sync_file_range",
"sync_file_range2",
"breakpoint",
"cacheflush",
"set_tls"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"arches": [
"arm",
"arm64"
]
},
"excludes": {}
},
{
"names": [
"arch_prctl"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"arches": [
"amd64",
"x32"
]
},
"excludes": {}
},
{
"names": [
"modify_ldt"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"arches": [
"amd64",
"x32",
"x86"
]
},
"excludes": {}
},
{
"names": [
"s390_pci_mmio_read",
"s390_pci_mmio_write",
"s390_runtime_instr"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"arches": [
"s390",
"s390x"
]
},
"excludes": {}
},
{
"names": [
"open_by_handle_at"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"caps": [
"CAP_DAC_READ_SEARCH"
]
},
"excludes": {}
},
{
"names": [
"bpf",
"clone",
"fanotify_init",
"lookup_dcookie",
"mount",
"name_to_handle_at",
"perf_event_open",
"quotactl",
"setdomainname",
"sethostname",
"setns",
"syslog",
"umount",
"umount2",
"unshare"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"caps": [
"CAP_SYS_ADMIN"
]
},
"excludes": {}
},
{
"names": [
"clone"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 2080505856,
"valueTwo": 0,
"op": "SCMP_CMP_MASKED_EQ"
}
],
"comment": "",
"includes": {},
"excludes": {
"caps": [
"CAP_SYS_ADMIN"
],
"arches": [
"s390",
"s390x"
]
}
},
{
"names": [
"clone"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 1,
"value": 2080505856,
"valueTwo": 0,
"op": "SCMP_CMP_MASKED_EQ"
}
],
"comment": "s390 parameter ordering for clone is different",
"includes": {
"arches": [
"s390",
"s390x"
]
},
"excludes": {
"caps": [
"CAP_SYS_ADMIN"
]
}
},
{
"names": [
"reboot"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"caps": [
"CAP_SYS_BOOT"
]
},
"excludes": {}
},
{
"names": [
"chroot"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"caps": [
"CAP_SYS_CHROOT"
]
},
"excludes": {}
},
{
"names": [
"delete_module",
"init_module",
"finit_module",
"query_module"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"caps": [
"CAP_SYS_MODULE"
]
},
"excludes": {}
},
{
"names": [
"acct"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"caps": [
"CAP_SYS_PACCT"
]
},
"excludes": {}
},
{
"names": [
"kcmp",
"process_vm_readv",
"process_vm_writev",
"ptrace"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"caps": [
"CAP_SYS_PTRACE"
]
},
"excludes": {}
},
{
"names": [
"iopl",
"ioperm"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"caps": [
"CAP_SYS_RAWIO"
]
},
"excludes": {}
},
{
"names": [
"settimeofday",
"stime",
"clock_settime"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"caps": [
"CAP_SYS_TIME"
]
},
"excludes": {}
},
{
"names": [
"vhangup"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"caps": [
"CAP_SYS_TTY_CONFIG"
]
},
"excludes": {}
},
{
"names": [
"get_mempolicy",
"mbind",
"set_mempolicy"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"caps": [
"CAP_SYS_NICE"
]
},
"excludes": {}
},
{
"names": [
"syslog"
],
"action": "SCMP_ACT_ALLOW",
"args": [],
"comment": "",
"includes": {
"caps": [
"CAP_SYSLOG"
]
},
"excludes": {}
}
]
}

View File

@@ -0,0 +1,27 @@
{
"defaultAction": "SCMP_ACT_ERRNO",
"syscalls": [
{
"name": "clone",
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 2080505856,
"valueTwo": 0,
"op": "SCMP_CMP_MASKED_EQ"
}
]
},
{
"name": "open",
"action": "SCMP_ACT_ALLOW",
"args": []
},
{
"name": "close",
"action": "SCMP_ACT_ALLOW",
"args": []
}
]
}

View File

@@ -0,0 +1,30 @@
package oci
import (
"testing"
fuzz "github.com/AdaLogics/go-fuzz-headers"
"github.com/opencontainers/runtime-spec/specs-go"
)
func FuzzAppendDevicePermissionsFromCgroupRules(f *testing.F) {
f.Fuzz(func(t *testing.T, data []byte) {
ff := fuzz.NewConsumer(data)
sp := make([]specs.LinuxDeviceCgroup, 0)
noOfRecords, err := ff.GetInt()
if err != nil {
return
}
for i := 0; i < noOfRecords%40; i++ {
s := specs.LinuxDeviceCgroup{}
err := ff.GenerateStruct(&s)
if err != nil {
return
}
sp = append(sp, s)
}
rules := make([]string, 0)
ff.CreateSlice(&rules)
_, _ = AppendDevicePermissionsFromCgroupRules(sp, rules)
})
}

View File

@@ -0,0 +1,27 @@
package oci
import "github.com/opencontainers/runtime-spec/specs-go"
// RemoveNamespace removes the `nsType` namespace from OCI spec `s`
func RemoveNamespace(s *specs.Spec, nsType specs.LinuxNamespaceType) {
if s.Linux == nil {
return
}
for i, n := range s.Linux.Namespaces {
if n.Type == nsType {
s.Linux.Namespaces = append(s.Linux.Namespaces[:i], s.Linux.Namespaces[i+1:]...)
return
}
}
}
// NamespacePath returns the configured Path of the first namespace in
// s.Linux.Namespaces of type nsType.
func NamespacePath(s *specs.Spec, nsType specs.LinuxNamespaceType) (path string, ok bool) {
for _, n := range s.Linux.Namespaces {
if n.Type == nsType {
return n.Path, true
}
}
return "", false
}

72
daemon/pkg/oci/oci.go Normal file
View File

@@ -0,0 +1,72 @@
package oci
import (
"fmt"
"strconv"
"github.com/docker/docker/internal/lazyregexp"
"github.com/opencontainers/runtime-spec/specs-go"
)
// TODO verify if this regex is correct for "a" (all);
//
// The docs (https://github.com/torvalds/linux/blob/v5.10/Documentation/admin-guide/cgroup-v1/devices.rst) describe:
// "'all' means it applies to all types and all major and minor numbers", and shows an example
// that *only* passes `a` as value: `echo a > /sys/fs/cgroup/1/devices.allow, which would be
// the "implicit" equivalent of "a *:* rwm". Source-code also looks to confirm this, and returns
// early for "a" (all); https://github.com/torvalds/linux/blob/v5.10/security/device_cgroup.c#L614-L642
var deviceCgroupRuleRegex = lazyregexp.New("^([acb]) ([0-9]+|\\*):([0-9]+|\\*) ([rwm]{1,3})$")
// SetCapabilities sets the provided capabilities on the spec.
//
// Deprecated: this function is no longer used and will be removed in the next release.
func SetCapabilities(s *specs.Spec, caplist []string) error {
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.Capabilities = &specs.LinuxCapabilities{
Effective: caplist,
Bounding: caplist,
Permitted: caplist,
}
return nil
}
// AppendDevicePermissionsFromCgroupRules takes rules for the devices cgroup to append to the default set
func AppendDevicePermissionsFromCgroupRules(devPermissions []specs.LinuxDeviceCgroup, rules []string) ([]specs.LinuxDeviceCgroup, error) {
for _, deviceCgroupRule := range rules {
ss := deviceCgroupRuleRegex.FindAllStringSubmatch(deviceCgroupRule, -1)
if len(ss) == 0 || len(ss[0]) != 5 {
return nil, fmt.Errorf("invalid device cgroup rule format: '%s'", deviceCgroupRule)
}
matches := ss[0]
dPermissions := specs.LinuxDeviceCgroup{
Allow: true,
Type: matches[1],
Access: matches[4],
}
if matches[2] == "*" {
major := int64(-1)
dPermissions.Major = &major
} else {
major, err := strconv.ParseInt(matches[2], 10, 64)
if err != nil {
return nil, fmt.Errorf("invalid major value in device cgroup rule format: '%s'", deviceCgroupRule)
}
dPermissions.Major = &major
}
if matches[3] == "*" {
minor := int64(-1)
dPermissions.Minor = &minor
} else {
minor, err := strconv.ParseInt(matches[3], 10, 64)
if err != nil {
return nil, fmt.Errorf("invalid minor value in device cgroup rule format: '%s'", deviceCgroupRule)
}
dPermissions.Minor = &minor
}
devPermissions = append(devPermissions, dPermissions)
}
return devPermissions, nil
}

167
daemon/pkg/oci/oci_test.go Normal file
View File

@@ -0,0 +1,167 @@
package oci
import (
"testing"
"github.com/opencontainers/runtime-spec/specs-go"
"gotest.tools/v3/assert"
)
func TestAppendDevicePermissionsFromCgroupRules(t *testing.T) {
ptr := func(i int64) *int64 { return &i }
tests := []struct {
doc string
rule string
expected specs.LinuxDeviceCgroup
expectedErr string
}{
{
doc: "empty rule",
rule: "",
expectedErr: `invalid device cgroup rule format: ''`,
},
{
doc: "multiple spaces after first column",
rule: "c 1:1 rwm",
expectedErr: `invalid device cgroup rule format: 'c 1:1 rwm'`,
},
{
doc: "multiple spaces after second column",
rule: "c 1:1 rwm",
expectedErr: `invalid device cgroup rule format: 'c 1:1 rwm'`,
},
{
doc: "leading spaces",
rule: " c 1:1 rwm",
expectedErr: `invalid device cgroup rule format: ' c 1:1 rwm'`,
},
{
doc: "trailing spaces",
rule: "c 1:1 rwm ",
expectedErr: `invalid device cgroup rule format: 'c 1:1 rwm '`,
},
{
doc: "unknown device type",
rule: "z 1:1 rwm",
expectedErr: `invalid device cgroup rule format: 'z 1:1 rwm'`,
},
{
doc: "invalid device type",
rule: "zz 1:1 rwm",
expectedErr: `invalid device cgroup rule format: 'zz 1:1 rwm'`,
},
{
doc: "missing colon",
rule: "c 11 rwm",
expectedErr: `invalid device cgroup rule format: 'c 11 rwm'`,
},
{
doc: "invalid device major-minor",
rule: "c a:a rwm",
expectedErr: `invalid device cgroup rule format: 'c a:a rwm'`,
},
{
doc: "negative major device",
rule: "c -1:1 rwm",
expectedErr: `invalid device cgroup rule format: 'c -1:1 rwm'`,
},
{
doc: "negative minor device",
rule: "c 1:-1 rwm",
expectedErr: `invalid device cgroup rule format: 'c 1:-1 rwm'`,
},
{
doc: "missing permissions",
rule: "c 1:1",
expectedErr: `invalid device cgroup rule format: 'c 1:1'`,
},
{
doc: "invalid permissions",
rule: "c 1:1 x",
expectedErr: `invalid device cgroup rule format: 'c 1:1 x'`,
},
{
doc: "too many permissions",
rule: "c 1:1 rwmrwm",
expectedErr: `invalid device cgroup rule format: 'c 1:1 rwmrwm'`,
},
{
doc: "major out of range",
rule: "c 18446744073709551616:1 rwm",
expectedErr: `invalid major value in device cgroup rule format: 'c 18446744073709551616:1 rwm'`,
},
{
doc: "minor out of range",
rule: "c 1:18446744073709551616 rwm",
expectedErr: `invalid minor value in device cgroup rule format: 'c 1:18446744073709551616 rwm'`,
},
{
doc: "all (a) devices",
rule: "a 1:1 rwm",
expected: specs.LinuxDeviceCgroup{Allow: true, Type: "a", Major: ptr(1), Minor: ptr(1), Access: "rwm"},
},
{
doc: "char (c) devices",
rule: "c 1:1 rwm",
expected: specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: ptr(1), Minor: ptr(1), Access: "rwm"},
},
{
doc: "block (b) devices",
rule: "b 1:1 rwm",
expected: specs.LinuxDeviceCgroup{Allow: true, Type: "b", Major: ptr(1), Minor: ptr(1), Access: "rwm"},
},
{
doc: "char device with rwm permissions",
rule: "c 7:128 rwm",
expected: specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: ptr(7), Minor: ptr(128), Access: "rwm"},
},
{
doc: "wildcard major",
rule: "c *:1 rwm",
expected: specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: ptr(-1), Minor: ptr(1), Access: "rwm"},
},
{
doc: "wildcard minor",
rule: "c 1:* rwm",
expected: specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: ptr(1), Minor: ptr(-1), Access: "rwm"},
},
{
doc: "wildcard major and minor",
rule: "c *:* rwm",
expected: specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: ptr(-1), Minor: ptr(-1), Access: "rwm"},
},
{
doc: "read (r) permission",
rule: "c 1:1 r",
expected: specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: ptr(1), Minor: ptr(1), Access: "r"},
},
{
doc: "write (w) permission",
rule: "c 1:1 w",
expected: specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: ptr(1), Minor: ptr(1), Access: "w"},
},
{
doc: "mknod (m) permission",
rule: "c 1:1 m",
expected: specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: ptr(1), Minor: ptr(1), Access: "m"},
},
{
doc: "mknod (m) and read (r) permission",
rule: "c 1:1 mr",
expected: specs.LinuxDeviceCgroup{Allow: true, Type: "c", Major: ptr(1), Minor: ptr(1), Access: "mr"},
},
}
for _, tc := range tests {
t.Run(tc.doc, func(t *testing.T) {
out, err := AppendDevicePermissionsFromCgroupRules([]specs.LinuxDeviceCgroup{}, []string{tc.rule})
if tc.expectedErr != "" {
assert.Error(t, err, tc.expectedErr)
return
}
assert.NilError(t, err)
assert.DeepEqual(t, out, []specs.LinuxDeviceCgroup{tc.expected})
})
}
}

View File

@@ -0,0 +1,39 @@
//go:build linux
package oci
import (
"encoding/json"
"os"
"testing"
"github.com/moby/profiles/seccomp"
)
func TestSeccompLoadProfile(t *testing.T) {
profiles := []string{"default.json", "default-old-format.json", "example.json"}
for _, p := range profiles {
t.Run(p, func(t *testing.T) {
f, err := os.ReadFile("fixtures/" + p)
if err != nil {
t.Fatal(err)
}
rs := DefaultLinuxSpec()
if _, err := seccomp.LoadProfile(string(f), &rs); err != nil {
t.Fatal(err)
}
})
}
}
func TestSeccompLoadDefaultProfile(t *testing.T) {
b, err := json.Marshal(seccomp.DefaultProfile())
if err != nil {
t.Fatal(err)
}
rs := DefaultLinuxSpec()
if _, err := seccomp.LoadProfile(string(b), &rs); err != nil {
t.Fatal(err)
}
}

View File

@@ -9,9 +9,9 @@ import (
"runtime"
"strings"
"github.com/docker/docker/daemon/pkg/oci"
"github.com/docker/docker/internal/rootless/mountopts"
"github.com/docker/docker/internal/sliceutil"
"github.com/docker/docker/oci"
"github.com/moby/moby/api/types"
"github.com/moby/sys/userns"
"github.com/opencontainers/runtime-spec/specs-go"