diff --git a/api/server/router/container/container_routes.go b/api/server/router/container/container_routes.go index 7191d291c8..97ccda3734 100644 --- a/api/server/router/container/container_routes.go +++ b/api/server/router/container/container_routes.go @@ -23,6 +23,7 @@ import ( "github.com/docker/docker/api/types/versions" containerpkg "github.com/docker/docker/container" "github.com/docker/docker/errdefs" + "github.com/docker/docker/libnetwork/netlabel" "github.com/docker/docker/pkg/ioutils" "github.com/docker/docker/runconfig" ocispec "github.com/opencontainers/image-spec/specs-go/v1" @@ -619,6 +620,12 @@ func (s *containerRouter) postContainersCreate(ctx context.Context, w http.Respo warnings = append(warnings, warn) } + if warn, err := handleSysctlBC(hostConfig, networkingConfig, version); err != nil { + return err + } else if warn != "" { + warnings = append(warnings, warn) + } + if hostConfig.PidsLimit != nil && *hostConfig.PidsLimit <= 0 { // Don't set a limit if either no limit was specified, or "unlimited" was // explicitly set. @@ -694,6 +701,94 @@ func handleMACAddressBC(config *container.Config, hostConfig *container.HostConf return warning, nil } +// handleSysctlBC migrates top level network endpoint-specific '--sysctl' +// settings to an DriverOpts for an endpoint. This is necessary because sysctls +// are applied during container task creation, but sysctls that name an interface +// (for example 'net.ipv6.conf.eth0.forwarding') cannot be applied until the +// interface has been created. So, these settings are removed from hostConfig.Sysctls +// and added to DriverOpts[netlabel.EndpointSysctls]. +// +// Because interface names ('ethN') are allocated sequentially, and the order of +// network connections is not deterministic on container restart, only 'eth0' +// would work reliably in a top-level '--sysctl' option, and then only when +// there's a single initial network connection. So, settings for 'eth0' are +// migrated to the primary interface, identified by 'hostConfig.NetworkMode'. +// Settings for other interfaces are treated as errors. +// +// In the DriverOpts, because the interface name cannot be determined in advance, the +// interface name is replaced by "IFNAME". For example, 'net.ipv6.conf.eth0.forwarding' +// becomes 'net.ipv6.conf.IFNAME.forwarding'. The value in DriverOpts is a +// comma-separated list. +// +// A warning is generated when settings are migrated. +func handleSysctlBC( + hostConfig *container.HostConfig, + netConfig *network.NetworkingConfig, + version string, +) (string, error) { + if !hostConfig.NetworkMode.IsPrivate() { + return "", nil + } + + var ep *network.EndpointSettings + var toDelete []string + var netIfSysctls []string + for k, v := range hostConfig.Sysctls { + // If the sysctl name matches "net.*.*.eth0.*" ... + if spl := strings.SplitN(k, ".", 5); len(spl) == 5 && spl[0] == "net" && strings.HasPrefix(spl[3], "eth") { + netIfSysctl := fmt.Sprintf("net.%s.%s.IFNAME.%s=%s", spl[1], spl[2], spl[4], v) + // Find the EndpointConfig to migrate settings to, if not already found. + if ep == nil { + // Per-endpoint sysctls were introduced in API version 1.46. Migration is + // needed, but refuse to do it automatically for newer versions of the API. + if versions.GreaterThan(version, "1.46") { + return "", fmt.Errorf("interface specific sysctl setting %q must be supplied using driver option '%s'", + k, netlabel.EndpointSysctls) + } + var err error + ep, err = epConfigForNetMode(version, hostConfig.NetworkMode, netConfig) + if err != nil { + return "", fmt.Errorf("unable to find a network for sysctl %s: %w", k, err) + } + } + // Only try to migrate settings for "eth0", anything else would always + // have behaved unpredictably. + if spl[3] != "eth0" { + return "", fmt.Errorf(`unable to determine network endpoint for sysctl %s, use driver option '%s' to set per-interface sysctls`, + k, netlabel.EndpointSysctls) + } + // Prepare the migration. + toDelete = append(toDelete, k) + netIfSysctls = append(netIfSysctls, netIfSysctl) + } + } + if ep == nil { + return "", nil + } + + newDriverOpt := strings.Join(netIfSysctls, ",") + warning := fmt.Sprintf(`Migrated sysctl %q to DriverOpts{%q:%q}.`, + strings.Join(toDelete, ","), + netlabel.EndpointSysctls, newDriverOpt) + + // Append existing per-endpoint sysctls to the migrated sysctls (give priority + // to per-endpoint settings). + if ep.DriverOpts == nil { + ep.DriverOpts = map[string]string{} + } + if oldDriverOpt, ok := ep.DriverOpts[netlabel.EndpointSysctls]; ok { + newDriverOpt += "," + oldDriverOpt + } + ep.DriverOpts[netlabel.EndpointSysctls] = newDriverOpt + + // Delete migrated settings from the top-level sysctls. + for _, k := range toDelete { + delete(hostConfig.Sysctls, k) + } + + return warning, nil +} + // epConfigForNetMode finds, or creates, an entry in netConfig.EndpointsConfig // corresponding to nwMode. // diff --git a/api/server/router/container/container_routes_test.go b/api/server/router/container/container_routes_test.go index 7c70e1c01f..dd31ca2ab2 100644 --- a/api/server/router/container/container_routes_test.go +++ b/api/server/router/container/container_routes_test.go @@ -1,10 +1,12 @@ package container import ( + "strings" "testing" "github.com/docker/docker/api/types/container" "github.com/docker/docker/api/types/network" + "github.com/docker/docker/libnetwork/netlabel" "gotest.tools/v3/assert" is "gotest.tools/v3/assert/cmp" ) @@ -228,3 +230,121 @@ func TestEpConfigForNetMode(t *testing.T) { }) } } + +func TestHandleSysctlBC(t *testing.T) { + testcases := []struct { + name string + apiVersion string + networkMode string + sysctls map[string]string + epConfig map[string]*network.EndpointSettings + expEpSysctls []string + expSysctls map[string]string + expWarningContains []string + expError string + }{ + { + name: "migrate to new ep", + apiVersion: "1.46", + networkMode: "mynet", + sysctls: map[string]string{ + "net.ipv6.conf.all.disable_ipv6": "0", + "net.ipv6.conf.eth0.accept_ra": "2", + "net.ipv6.conf.eth0.forwarding": "1", + }, + expSysctls: map[string]string{ + "net.ipv6.conf.all.disable_ipv6": "0", + }, + expEpSysctls: []string{"net.ipv6.conf.IFNAME.forwarding=1", "net.ipv6.conf.IFNAME.accept_ra=2"}, + expWarningContains: []string{ + "Migrated", + "net.ipv6.conf.eth0.accept_ra", "net.ipv6.conf.IFNAME.accept_ra=2", + "net.ipv6.conf.eth0.forwarding", "net.ipv6.conf.IFNAME.forwarding=1", + }, + }, + { + name: "migrate nothing", + apiVersion: "1.46", + networkMode: "mynet", + sysctls: map[string]string{ + "net.ipv6.conf.all.disable_ipv6": "0", + }, + expSysctls: map[string]string{ + "net.ipv6.conf.all.disable_ipv6": "0", + }, + }, + { + name: "migration disabled for newer api", + apiVersion: "1.47", + networkMode: "mynet", + sysctls: map[string]string{ + "net.ipv6.conf.eth0.accept_ra": "2", + }, + expError: "must be supplied using driver option 'com.docker.network.endpoint.sysctls'", + }, + { + name: "only migrate eth0", + apiVersion: "1.46", + networkMode: "mynet", + sysctls: map[string]string{ + "net.ipv6.conf.eth1.accept_ra": "2", + }, + expError: "unable to determine network endpoint", + }, + { + name: "net name mismatch", + apiVersion: "1.46", + networkMode: "mynet", + epConfig: map[string]*network.EndpointSettings{ + "shortid": {EndpointID: "epone"}, + }, + sysctls: map[string]string{ + "net.ipv6.conf.eth1.accept_ra": "2", + }, + expError: "unable to find a network for sysctl", + }, + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + hostCfg := &container.HostConfig{ + NetworkMode: container.NetworkMode(tc.networkMode), + Sysctls: map[string]string{}, + } + for k, v := range tc.sysctls { + hostCfg.Sysctls[k] = v + } + netCfg := &network.NetworkingConfig{ + EndpointsConfig: tc.epConfig, + } + + warnings, err := handleSysctlBC(hostCfg, netCfg, tc.apiVersion) + + for _, s := range tc.expWarningContains { + assert.Check(t, is.Contains(warnings, s)) + } + + if tc.expError != "" { + assert.Check(t, is.ErrorContains(err, tc.expError)) + } else { + assert.Check(t, err) + + assert.Check(t, is.DeepEqual(hostCfg.Sysctls, tc.expSysctls)) + + ep := netCfg.EndpointsConfig[tc.networkMode] + if ep == nil { + assert.Check(t, is.Nil(tc.expEpSysctls)) + } else { + got, ok := ep.DriverOpts[netlabel.EndpointSysctls] + assert.Check(t, ok) + // Check for expected ep-sysctls. + for _, want := range tc.expEpSysctls { + assert.Check(t, is.Contains(got, want)) + } + // Check for unexpected ep-sysctls. + assert.Check(t, is.Len(got, len(strings.Join(tc.expEpSysctls, ",")))) + } + } + }) + } +} diff --git a/daemon/container_operations.go b/daemon/container_operations.go index 86da02f172..f6e4ff55f2 100644 --- a/daemon/container_operations.go +++ b/daemon/container_operations.go @@ -615,6 +615,21 @@ func validateEndpointSettings(nw *libnetwork.Network, nwName string, epConfig *n } } + if sysctls, ok := epConfig.DriverOpts[netlabel.EndpointSysctls]; ok { + for _, sysctl := range strings.Split(sysctls, ",") { + scname := strings.SplitN(sysctl, ".", 5) + // Allow "ifname" as well as "IFNAME", because the CLI converts to lower case. + if len(scname) != 5 || + (scname[1] != "ipv4" && scname[1] != "ipv6" && scname[1] != "mpls") || + (scname[3] != "IFNAME" && scname[3] != "ifname") { + errs = append(errs, + fmt.Errorf( + "unrecognised network interface sysctl '%s'; represent 'net.X.Y.ethN.Z=V' as 'net.X.Y.IFNAME.Z=V', 'X' must be 'ipv4', 'ipv6' or 'mpls'", + sysctl)) + } + } + } + if err := multierror.Join(errs...); err != nil { return fmt.Errorf("invalid endpoint settings:\n%w", err) } diff --git a/docs/api/version-history.md b/docs/api/version-history.md index 2a1fc4fd93..de05d33d63 100644 --- a/docs/api/version-history.md +++ b/docs/api/version-history.md @@ -15,6 +15,18 @@ keywords: "API, Docker, rcli, REST, documentation" ## v1.46 API changes +[Docker Engine API v1.46](https://docs.docker.com/engine/api/v1.46/) documentation + +* `POST /containers/create` field `NetworkingConfig.EndpointsConfig.DriverOpts`, + and `POST /networks/{id}/connect` field `EndpointsConfig.DriverOpts`, now + support label `com.docker.network.endpoint.sysctls` for setting per-interface + sysctls. The value is a comma separated list of sysctl assignments, the + interface name must be "IFNAME". For example, to set + `net.ipv4.config.eth0.log_martians=1`, use + `net.ipv4.config.IFNAME.log_martians=1`. In API versions up-to 1.46, top level + `--sysctl` settings for `eth0` will be migrated to `DriverOpts` when possible. + This automatic migration will be removed for API versions 1.47 and greater. + ## v1.45 API changes [Docker Engine API v1.45](https://docs.docker.com/engine/api/v1.45/) documentation diff --git a/integration/networking/bridge_test.go b/integration/networking/bridge_test.go index cb2abfaf66..908e21b29e 100644 --- a/integration/networking/bridge_test.go +++ b/integration/networking/bridge_test.go @@ -10,8 +10,10 @@ import ( "github.com/docker/docker/api/types" containertypes "github.com/docker/docker/api/types/container" + apinetwork "github.com/docker/docker/api/types/network" "github.com/docker/docker/integration/internal/container" "github.com/docker/docker/integration/internal/network" + "github.com/docker/docker/libnetwork/netlabel" "github.com/docker/docker/testutil" "github.com/docker/docker/testutil/daemon" "github.com/google/go-cmp/cmp/cmpopts" @@ -828,3 +830,38 @@ func TestReadOnlySlashProc(t *testing.T) { }) } } + +// Test that it's possible to set a sysctl on an interface in the container +// using DriverOpts. +func TestSetEndpointSysctl(t *testing.T) { + skip.If(t, testEnv.DaemonInfo.OSType == "windows", "no sysctl on Windows") + + ctx := setupTest(t) + d := daemon.New(t) + d.StartWithBusybox(ctx, t) + defer d.Stop(t) + + c := d.NewClientT(t) + defer c.Close() + + const scName = "net.ipv4.conf.eth0.forwarding" + for _, ifname := range []string{"IFNAME", "ifname"} { + for _, val := range []string{"0", "1"} { + t.Run("ifname="+ifname+"/val="+val, func(t *testing.T) { + ctx := testutil.StartSpan(ctx, t) + runRes := container.RunAttach(ctx, t, c, + container.WithCmd("sysctl", "-qn", scName), + container.WithEndpointSettings(apinetwork.NetworkBridge, &apinetwork.EndpointSettings{ + DriverOpts: map[string]string{ + netlabel.EndpointSysctls: "net.ipv4.conf." + ifname + ".forwarding=" + val, + }, + }), + ) + defer c.ContainerRemove(ctx, runRes.ContainerID, containertypes.RemoveOptions{Force: true}) + + stdout := runRes.Stdout.String() + assert.Check(t, is.Equal(strings.TrimSpace(stdout), val)) + }) + } + } +} diff --git a/libnetwork/endpoint.go b/libnetwork/endpoint.go index 836313ccd3..1ee235f1dc 100644 --- a/libnetwork/endpoint.go +++ b/libnetwork/endpoint.go @@ -8,6 +8,7 @@ import ( "encoding/json" "fmt" "net" + "strings" "sync" "github.com/containerd/log" @@ -401,6 +402,18 @@ func (ep *Endpoint) Value() []byte { return b } +func (ep *Endpoint) getSysctls() []string { + ep.mu.Lock() + defer ep.mu.Unlock() + + if s, ok := ep.generic[netlabel.EndpointSysctls]; ok { + if ss, ok := s.(string); ok { + return strings.Split(ss, ",") + } + } + return nil +} + func (ep *Endpoint) SetValue(value []byte) error { return json.Unmarshal(value, ep) } diff --git a/libnetwork/netlabel/labels.go b/libnetwork/netlabel/labels.go index 91232af6aa..5e243b8520 100644 --- a/libnetwork/netlabel/labels.go +++ b/libnetwork/netlabel/labels.go @@ -26,6 +26,10 @@ const ( // DNSServers A list of DNS servers associated with the endpoint DNSServers = Prefix + ".endpoint.dnsservers" + // EndpointSysctls is a comma separated list interface-specific sysctls + // where the interface name is represented by the string "IFNAME". + EndpointSysctls = Prefix + ".endpoint.sysctls" + // EnableIPv6 constant represents enabling IPV6 at network level EnableIPv6 = Prefix + ".enable_ipv6" diff --git a/libnetwork/osl/interface_linux.go b/libnetwork/osl/interface_linux.go index 3491aac70c..1dec44dfd4 100644 --- a/libnetwork/osl/interface_linux.go +++ b/libnetwork/osl/interface_linux.go @@ -4,12 +4,16 @@ import ( "context" "fmt" "net" + "os" + "path/filepath" + "strings" "syscall" "time" "github.com/containerd/log" "github.com/docker/docker/libnetwork/ns" "github.com/docker/docker/libnetwork/types" + "github.com/pkg/errors" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" ) @@ -55,6 +59,7 @@ type Interface struct { llAddrs []*net.IPNet routes []*net.IPNet bridge bool + sysctls []string ns *Namespace } @@ -223,7 +228,7 @@ func (n *Namespace) AddInterface(srcName, dstPrefix string, options ...IfaceOpti } // Configure the interface now this is moved in the proper namespace. - if err := configureInterface(nlh, iface, i); err != nil { + if err := n.configureInterface(nlh, iface, i); err != nil { // If configuring the device fails move it back to the host namespace // and change the name back to the source name. This allows the caller // to properly cleanup the interface. Its important especially for @@ -312,7 +317,7 @@ func (n *Namespace) RemoveInterface(i *Interface) error { return nil } -func configureInterface(nlh *netlink.Handle, iface netlink.Link, i *Interface) error { +func (n *Namespace) configureInterface(nlh *netlink.Handle, iface netlink.Link, i *Interface) error { ifaceName := iface.Attrs().Name ifaceConfigurators := []struct { Fn func(*netlink.Handle, netlink.Link, *Interface) error @@ -331,6 +336,11 @@ func configureInterface(nlh *netlink.Handle, iface netlink.Link, i *Interface) e return fmt.Errorf("%s: %v", config.ErrMessage, err) } } + + if err := n.setSysctls(i.dstName, i.sysctls); err != nil { + return err + } + return nil } @@ -393,6 +403,42 @@ func setInterfaceLinkLocalIPs(nlh *netlink.Handle, iface netlink.Link, i *Interf return nil } +func (n *Namespace) setSysctls(ifName string, sysctls []string) error { + for _, sc := range sysctls { + k, v, found := strings.Cut(sc, "=") + if !found { + return fmt.Errorf("expected sysctl '%s' to have format name=value", sc) + } + sk := strings.Split(k, ".") + if len(sk) != 5 { + return fmt.Errorf("expected sysctl '%s' to have format net.X.Y.IFNAME.Z", sc) + } + + sysPath := filepath.Join(append([]string{"/proc/sys", sk[0], sk[1], sk[2], ifName}, sk[4:]...)...) + var errF error + f := func() { + if fi, err := os.Stat(sysPath); err != nil || !fi.Mode().IsRegular() { + errF = fmt.Errorf("%s is not a sysctl file", sysPath) + } else if curVal, err := os.ReadFile(sysPath); err != nil { + errF = errors.Wrapf(err, "unable to read '%s'", sysPath) + } else if strings.TrimSpace(string(curVal)) == v { + // The value is already correct, don't try to write the file in case + // "/proc/sys/net" is a read-only filesystem. + } else if err := os.WriteFile(sysPath, []byte(v), 0o644); err != nil { + errF = errors.Wrapf(err, "unable to write to '%s'", sysPath) + } + } + + if err := n.InvokeFunc(f); err != nil { + return errors.Wrapf(err, "failed to run sysctl setter in network namespace") + } + if errF != nil { + return errF + } + } + return nil +} + func setInterfaceName(nlh *netlink.Handle, iface netlink.Link, i *Interface) error { return nlh.LinkSetName(iface, i.DstName()) } diff --git a/libnetwork/osl/options_linux.go b/libnetwork/osl/options_linux.go index bc617c7b0a..f2e668f242 100644 --- a/libnetwork/osl/options_linux.go +++ b/libnetwork/osl/options_linux.go @@ -81,3 +81,11 @@ func WithRoutes(routes []*net.IPNet) IfaceOption { return nil } } + +// WithSysctls sets the interface sysctls. +func WithSysctls(sysctls []string) IfaceOption { + return func(i *Interface) error { + i.sysctls = sysctls + return nil + } +} diff --git a/libnetwork/sandbox_linux.go b/libnetwork/sandbox_linux.go index c34e96cce6..3cae9b9904 100644 --- a/libnetwork/sandbox_linux.go +++ b/libnetwork/sandbox_linux.go @@ -315,6 +315,9 @@ func (sb *Sandbox) populateNetworkResources(ep *Endpoint) error { if i.mac != nil { ifaceOptions = append(ifaceOptions, osl.WithMACAddress(i.mac)) } + if sysctls := ep.getSysctls(); len(sysctls) > 0 { + ifaceOptions = append(ifaceOptions, osl.WithSysctls(sysctls)) + } if err := sb.osSbox.AddInterface(i.srcName, i.dstPrefix, ifaceOptions...); err != nil { return fmt.Errorf("failed to add interface %s to sandbox: %v", i.srcName, err)