From 0071832226b42dbb12bc01bf1c668dca891b2d33 Mon Sep 17 00:00:00 2001 From: Rob Murray Date: Tue, 2 Apr 2024 15:11:02 +0100 Subject: [PATCH] Add per-endpoint sysctls to DriverOpts Until now it's been possible to set per-interface sysctls using, for example, '--sysctl net.ipv6.conf.eth0.accept_ra=2'. But, the index in the interface name is allocated serially, and the numbering in a container with more than one interface may change when a container is restarted. The change to make it possible to connect a container to more than one network when it's created increased the ambiguity. This change adds label "com.docker.network.endpoint.sysctls" to the DriverOpts in EndpointSettings. This option is explicitly associated with the interface. Settings in "--sysctl" for "eth0" are migrated to DriverOpts. Because using "--sysctl" with any interface apart from "eth0" would have unpredictable results, it is now an error to use any other interface name in the top level "--sysctl" option. The error message includes a hint at how to use the new per-interface setting. The per-endpoint sysctl name has the interface name replaced by "IFNAME". For example: net.ipv6.conf.eth0.accept_ra=2 becomes: net.ipv6.conf.IFNAME.accept_ra=2 The value of DriverOpts["com.docker.network.endpoint.sysctls"] is a comma separated list. Settings from '--sysctl' are applied by the runtime lib during task creation. So, task creation fails if the endpoint does not exist. Applying per-endpoint settings during interface configuration means the endpoint can be created later, which paves the way for removal of the SetKey OCI prestart hook. Unlike other DriverOpts, the sysctl label itself is not driver-specific, but each driver has a chance to check settings/values and raise an error if a setting would cause it a problem - no such checks have been added in this initial version. As a future extension, if required, it would be possible for the driver to echo back valid/extended/modified settings to libnetwork for it to apply to the interface. (At that point, the syntax for the options could become driver specific to allow, for example, a driver to create more than one interface). Signed-off-by: Rob Murray --- .../router/container/container_routes.go | 95 ++++++++++++++ .../router/container/container_routes_test.go | 120 ++++++++++++++++++ daemon/container_operations.go | 15 +++ docs/api/version-history.md | 12 ++ integration/networking/bridge_test.go | 37 ++++++ libnetwork/endpoint.go | 13 ++ libnetwork/netlabel/labels.go | 4 + libnetwork/osl/interface_linux.go | 50 +++++++- libnetwork/osl/options_linux.go | 8 ++ libnetwork/sandbox_linux.go | 3 + 10 files changed, 355 insertions(+), 2 deletions(-) diff --git a/api/server/router/container/container_routes.go b/api/server/router/container/container_routes.go index 7191d291c8..97ccda3734 100644 --- a/api/server/router/container/container_routes.go +++ b/api/server/router/container/container_routes.go @@ -23,6 +23,7 @@ import ( "github.com/docker/docker/api/types/versions" containerpkg "github.com/docker/docker/container" "github.com/docker/docker/errdefs" + "github.com/docker/docker/libnetwork/netlabel" "github.com/docker/docker/pkg/ioutils" "github.com/docker/docker/runconfig" ocispec "github.com/opencontainers/image-spec/specs-go/v1" @@ -619,6 +620,12 @@ func (s *containerRouter) postContainersCreate(ctx context.Context, w http.Respo warnings = append(warnings, warn) } + if warn, err := handleSysctlBC(hostConfig, networkingConfig, version); err != nil { + return err + } else if warn != "" { + warnings = append(warnings, warn) + } + if hostConfig.PidsLimit != nil && *hostConfig.PidsLimit <= 0 { // Don't set a limit if either no limit was specified, or "unlimited" was // explicitly set. @@ -694,6 +701,94 @@ func handleMACAddressBC(config *container.Config, hostConfig *container.HostConf return warning, nil } +// handleSysctlBC migrates top level network endpoint-specific '--sysctl' +// settings to an DriverOpts for an endpoint. This is necessary because sysctls +// are applied during container task creation, but sysctls that name an interface +// (for example 'net.ipv6.conf.eth0.forwarding') cannot be applied until the +// interface has been created. So, these settings are removed from hostConfig.Sysctls +// and added to DriverOpts[netlabel.EndpointSysctls]. +// +// Because interface names ('ethN') are allocated sequentially, and the order of +// network connections is not deterministic on container restart, only 'eth0' +// would work reliably in a top-level '--sysctl' option, and then only when +// there's a single initial network connection. So, settings for 'eth0' are +// migrated to the primary interface, identified by 'hostConfig.NetworkMode'. +// Settings for other interfaces are treated as errors. +// +// In the DriverOpts, because the interface name cannot be determined in advance, the +// interface name is replaced by "IFNAME". For example, 'net.ipv6.conf.eth0.forwarding' +// becomes 'net.ipv6.conf.IFNAME.forwarding'. The value in DriverOpts is a +// comma-separated list. +// +// A warning is generated when settings are migrated. +func handleSysctlBC( + hostConfig *container.HostConfig, + netConfig *network.NetworkingConfig, + version string, +) (string, error) { + if !hostConfig.NetworkMode.IsPrivate() { + return "", nil + } + + var ep *network.EndpointSettings + var toDelete []string + var netIfSysctls []string + for k, v := range hostConfig.Sysctls { + // If the sysctl name matches "net.*.*.eth0.*" ... + if spl := strings.SplitN(k, ".", 5); len(spl) == 5 && spl[0] == "net" && strings.HasPrefix(spl[3], "eth") { + netIfSysctl := fmt.Sprintf("net.%s.%s.IFNAME.%s=%s", spl[1], spl[2], spl[4], v) + // Find the EndpointConfig to migrate settings to, if not already found. + if ep == nil { + // Per-endpoint sysctls were introduced in API version 1.46. Migration is + // needed, but refuse to do it automatically for newer versions of the API. + if versions.GreaterThan(version, "1.46") { + return "", fmt.Errorf("interface specific sysctl setting %q must be supplied using driver option '%s'", + k, netlabel.EndpointSysctls) + } + var err error + ep, err = epConfigForNetMode(version, hostConfig.NetworkMode, netConfig) + if err != nil { + return "", fmt.Errorf("unable to find a network for sysctl %s: %w", k, err) + } + } + // Only try to migrate settings for "eth0", anything else would always + // have behaved unpredictably. + if spl[3] != "eth0" { + return "", fmt.Errorf(`unable to determine network endpoint for sysctl %s, use driver option '%s' to set per-interface sysctls`, + k, netlabel.EndpointSysctls) + } + // Prepare the migration. + toDelete = append(toDelete, k) + netIfSysctls = append(netIfSysctls, netIfSysctl) + } + } + if ep == nil { + return "", nil + } + + newDriverOpt := strings.Join(netIfSysctls, ",") + warning := fmt.Sprintf(`Migrated sysctl %q to DriverOpts{%q:%q}.`, + strings.Join(toDelete, ","), + netlabel.EndpointSysctls, newDriverOpt) + + // Append existing per-endpoint sysctls to the migrated sysctls (give priority + // to per-endpoint settings). + if ep.DriverOpts == nil { + ep.DriverOpts = map[string]string{} + } + if oldDriverOpt, ok := ep.DriverOpts[netlabel.EndpointSysctls]; ok { + newDriverOpt += "," + oldDriverOpt + } + ep.DriverOpts[netlabel.EndpointSysctls] = newDriverOpt + + // Delete migrated settings from the top-level sysctls. + for _, k := range toDelete { + delete(hostConfig.Sysctls, k) + } + + return warning, nil +} + // epConfigForNetMode finds, or creates, an entry in netConfig.EndpointsConfig // corresponding to nwMode. // diff --git a/api/server/router/container/container_routes_test.go b/api/server/router/container/container_routes_test.go index 7c70e1c01f..dd31ca2ab2 100644 --- a/api/server/router/container/container_routes_test.go +++ b/api/server/router/container/container_routes_test.go @@ -1,10 +1,12 @@ package container import ( + "strings" "testing" "github.com/docker/docker/api/types/container" "github.com/docker/docker/api/types/network" + "github.com/docker/docker/libnetwork/netlabel" "gotest.tools/v3/assert" is "gotest.tools/v3/assert/cmp" ) @@ -228,3 +230,121 @@ func TestEpConfigForNetMode(t *testing.T) { }) } } + +func TestHandleSysctlBC(t *testing.T) { + testcases := []struct { + name string + apiVersion string + networkMode string + sysctls map[string]string + epConfig map[string]*network.EndpointSettings + expEpSysctls []string + expSysctls map[string]string + expWarningContains []string + expError string + }{ + { + name: "migrate to new ep", + apiVersion: "1.46", + networkMode: "mynet", + sysctls: map[string]string{ + "net.ipv6.conf.all.disable_ipv6": "0", + "net.ipv6.conf.eth0.accept_ra": "2", + "net.ipv6.conf.eth0.forwarding": "1", + }, + expSysctls: map[string]string{ + "net.ipv6.conf.all.disable_ipv6": "0", + }, + expEpSysctls: []string{"net.ipv6.conf.IFNAME.forwarding=1", "net.ipv6.conf.IFNAME.accept_ra=2"}, + expWarningContains: []string{ + "Migrated", + "net.ipv6.conf.eth0.accept_ra", "net.ipv6.conf.IFNAME.accept_ra=2", + "net.ipv6.conf.eth0.forwarding", "net.ipv6.conf.IFNAME.forwarding=1", + }, + }, + { + name: "migrate nothing", + apiVersion: "1.46", + networkMode: "mynet", + sysctls: map[string]string{ + "net.ipv6.conf.all.disable_ipv6": "0", + }, + expSysctls: map[string]string{ + "net.ipv6.conf.all.disable_ipv6": "0", + }, + }, + { + name: "migration disabled for newer api", + apiVersion: "1.47", + networkMode: "mynet", + sysctls: map[string]string{ + "net.ipv6.conf.eth0.accept_ra": "2", + }, + expError: "must be supplied using driver option 'com.docker.network.endpoint.sysctls'", + }, + { + name: "only migrate eth0", + apiVersion: "1.46", + networkMode: "mynet", + sysctls: map[string]string{ + "net.ipv6.conf.eth1.accept_ra": "2", + }, + expError: "unable to determine network endpoint", + }, + { + name: "net name mismatch", + apiVersion: "1.46", + networkMode: "mynet", + epConfig: map[string]*network.EndpointSettings{ + "shortid": {EndpointID: "epone"}, + }, + sysctls: map[string]string{ + "net.ipv6.conf.eth1.accept_ra": "2", + }, + expError: "unable to find a network for sysctl", + }, + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + hostCfg := &container.HostConfig{ + NetworkMode: container.NetworkMode(tc.networkMode), + Sysctls: map[string]string{}, + } + for k, v := range tc.sysctls { + hostCfg.Sysctls[k] = v + } + netCfg := &network.NetworkingConfig{ + EndpointsConfig: tc.epConfig, + } + + warnings, err := handleSysctlBC(hostCfg, netCfg, tc.apiVersion) + + for _, s := range tc.expWarningContains { + assert.Check(t, is.Contains(warnings, s)) + } + + if tc.expError != "" { + assert.Check(t, is.ErrorContains(err, tc.expError)) + } else { + assert.Check(t, err) + + assert.Check(t, is.DeepEqual(hostCfg.Sysctls, tc.expSysctls)) + + ep := netCfg.EndpointsConfig[tc.networkMode] + if ep == nil { + assert.Check(t, is.Nil(tc.expEpSysctls)) + } else { + got, ok := ep.DriverOpts[netlabel.EndpointSysctls] + assert.Check(t, ok) + // Check for expected ep-sysctls. + for _, want := range tc.expEpSysctls { + assert.Check(t, is.Contains(got, want)) + } + // Check for unexpected ep-sysctls. + assert.Check(t, is.Len(got, len(strings.Join(tc.expEpSysctls, ",")))) + } + } + }) + } +} diff --git a/daemon/container_operations.go b/daemon/container_operations.go index 86da02f172..f6e4ff55f2 100644 --- a/daemon/container_operations.go +++ b/daemon/container_operations.go @@ -615,6 +615,21 @@ func validateEndpointSettings(nw *libnetwork.Network, nwName string, epConfig *n } } + if sysctls, ok := epConfig.DriverOpts[netlabel.EndpointSysctls]; ok { + for _, sysctl := range strings.Split(sysctls, ",") { + scname := strings.SplitN(sysctl, ".", 5) + // Allow "ifname" as well as "IFNAME", because the CLI converts to lower case. + if len(scname) != 5 || + (scname[1] != "ipv4" && scname[1] != "ipv6" && scname[1] != "mpls") || + (scname[3] != "IFNAME" && scname[3] != "ifname") { + errs = append(errs, + fmt.Errorf( + "unrecognised network interface sysctl '%s'; represent 'net.X.Y.ethN.Z=V' as 'net.X.Y.IFNAME.Z=V', 'X' must be 'ipv4', 'ipv6' or 'mpls'", + sysctl)) + } + } + } + if err := multierror.Join(errs...); err != nil { return fmt.Errorf("invalid endpoint settings:\n%w", err) } diff --git a/docs/api/version-history.md b/docs/api/version-history.md index 2a1fc4fd93..de05d33d63 100644 --- a/docs/api/version-history.md +++ b/docs/api/version-history.md @@ -15,6 +15,18 @@ keywords: "API, Docker, rcli, REST, documentation" ## v1.46 API changes +[Docker Engine API v1.46](https://docs.docker.com/engine/api/v1.46/) documentation + +* `POST /containers/create` field `NetworkingConfig.EndpointsConfig.DriverOpts`, + and `POST /networks/{id}/connect` field `EndpointsConfig.DriverOpts`, now + support label `com.docker.network.endpoint.sysctls` for setting per-interface + sysctls. The value is a comma separated list of sysctl assignments, the + interface name must be "IFNAME". For example, to set + `net.ipv4.config.eth0.log_martians=1`, use + `net.ipv4.config.IFNAME.log_martians=1`. In API versions up-to 1.46, top level + `--sysctl` settings for `eth0` will be migrated to `DriverOpts` when possible. + This automatic migration will be removed for API versions 1.47 and greater. + ## v1.45 API changes [Docker Engine API v1.45](https://docs.docker.com/engine/api/v1.45/) documentation diff --git a/integration/networking/bridge_test.go b/integration/networking/bridge_test.go index cb2abfaf66..908e21b29e 100644 --- a/integration/networking/bridge_test.go +++ b/integration/networking/bridge_test.go @@ -10,8 +10,10 @@ import ( "github.com/docker/docker/api/types" containertypes "github.com/docker/docker/api/types/container" + apinetwork "github.com/docker/docker/api/types/network" "github.com/docker/docker/integration/internal/container" "github.com/docker/docker/integration/internal/network" + "github.com/docker/docker/libnetwork/netlabel" "github.com/docker/docker/testutil" "github.com/docker/docker/testutil/daemon" "github.com/google/go-cmp/cmp/cmpopts" @@ -828,3 +830,38 @@ func TestReadOnlySlashProc(t *testing.T) { }) } } + +// Test that it's possible to set a sysctl on an interface in the container +// using DriverOpts. +func TestSetEndpointSysctl(t *testing.T) { + skip.If(t, testEnv.DaemonInfo.OSType == "windows", "no sysctl on Windows") + + ctx := setupTest(t) + d := daemon.New(t) + d.StartWithBusybox(ctx, t) + defer d.Stop(t) + + c := d.NewClientT(t) + defer c.Close() + + const scName = "net.ipv4.conf.eth0.forwarding" + for _, ifname := range []string{"IFNAME", "ifname"} { + for _, val := range []string{"0", "1"} { + t.Run("ifname="+ifname+"/val="+val, func(t *testing.T) { + ctx := testutil.StartSpan(ctx, t) + runRes := container.RunAttach(ctx, t, c, + container.WithCmd("sysctl", "-qn", scName), + container.WithEndpointSettings(apinetwork.NetworkBridge, &apinetwork.EndpointSettings{ + DriverOpts: map[string]string{ + netlabel.EndpointSysctls: "net.ipv4.conf." + ifname + ".forwarding=" + val, + }, + }), + ) + defer c.ContainerRemove(ctx, runRes.ContainerID, containertypes.RemoveOptions{Force: true}) + + stdout := runRes.Stdout.String() + assert.Check(t, is.Equal(strings.TrimSpace(stdout), val)) + }) + } + } +} diff --git a/libnetwork/endpoint.go b/libnetwork/endpoint.go index 836313ccd3..1ee235f1dc 100644 --- a/libnetwork/endpoint.go +++ b/libnetwork/endpoint.go @@ -8,6 +8,7 @@ import ( "encoding/json" "fmt" "net" + "strings" "sync" "github.com/containerd/log" @@ -401,6 +402,18 @@ func (ep *Endpoint) Value() []byte { return b } +func (ep *Endpoint) getSysctls() []string { + ep.mu.Lock() + defer ep.mu.Unlock() + + if s, ok := ep.generic[netlabel.EndpointSysctls]; ok { + if ss, ok := s.(string); ok { + return strings.Split(ss, ",") + } + } + return nil +} + func (ep *Endpoint) SetValue(value []byte) error { return json.Unmarshal(value, ep) } diff --git a/libnetwork/netlabel/labels.go b/libnetwork/netlabel/labels.go index 91232af6aa..5e243b8520 100644 --- a/libnetwork/netlabel/labels.go +++ b/libnetwork/netlabel/labels.go @@ -26,6 +26,10 @@ const ( // DNSServers A list of DNS servers associated with the endpoint DNSServers = Prefix + ".endpoint.dnsservers" + // EndpointSysctls is a comma separated list interface-specific sysctls + // where the interface name is represented by the string "IFNAME". + EndpointSysctls = Prefix + ".endpoint.sysctls" + // EnableIPv6 constant represents enabling IPV6 at network level EnableIPv6 = Prefix + ".enable_ipv6" diff --git a/libnetwork/osl/interface_linux.go b/libnetwork/osl/interface_linux.go index 3491aac70c..1dec44dfd4 100644 --- a/libnetwork/osl/interface_linux.go +++ b/libnetwork/osl/interface_linux.go @@ -4,12 +4,16 @@ import ( "context" "fmt" "net" + "os" + "path/filepath" + "strings" "syscall" "time" "github.com/containerd/log" "github.com/docker/docker/libnetwork/ns" "github.com/docker/docker/libnetwork/types" + "github.com/pkg/errors" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" ) @@ -55,6 +59,7 @@ type Interface struct { llAddrs []*net.IPNet routes []*net.IPNet bridge bool + sysctls []string ns *Namespace } @@ -223,7 +228,7 @@ func (n *Namespace) AddInterface(srcName, dstPrefix string, options ...IfaceOpti } // Configure the interface now this is moved in the proper namespace. - if err := configureInterface(nlh, iface, i); err != nil { + if err := n.configureInterface(nlh, iface, i); err != nil { // If configuring the device fails move it back to the host namespace // and change the name back to the source name. This allows the caller // to properly cleanup the interface. Its important especially for @@ -312,7 +317,7 @@ func (n *Namespace) RemoveInterface(i *Interface) error { return nil } -func configureInterface(nlh *netlink.Handle, iface netlink.Link, i *Interface) error { +func (n *Namespace) configureInterface(nlh *netlink.Handle, iface netlink.Link, i *Interface) error { ifaceName := iface.Attrs().Name ifaceConfigurators := []struct { Fn func(*netlink.Handle, netlink.Link, *Interface) error @@ -331,6 +336,11 @@ func configureInterface(nlh *netlink.Handle, iface netlink.Link, i *Interface) e return fmt.Errorf("%s: %v", config.ErrMessage, err) } } + + if err := n.setSysctls(i.dstName, i.sysctls); err != nil { + return err + } + return nil } @@ -393,6 +403,42 @@ func setInterfaceLinkLocalIPs(nlh *netlink.Handle, iface netlink.Link, i *Interf return nil } +func (n *Namespace) setSysctls(ifName string, sysctls []string) error { + for _, sc := range sysctls { + k, v, found := strings.Cut(sc, "=") + if !found { + return fmt.Errorf("expected sysctl '%s' to have format name=value", sc) + } + sk := strings.Split(k, ".") + if len(sk) != 5 { + return fmt.Errorf("expected sysctl '%s' to have format net.X.Y.IFNAME.Z", sc) + } + + sysPath := filepath.Join(append([]string{"/proc/sys", sk[0], sk[1], sk[2], ifName}, sk[4:]...)...) + var errF error + f := func() { + if fi, err := os.Stat(sysPath); err != nil || !fi.Mode().IsRegular() { + errF = fmt.Errorf("%s is not a sysctl file", sysPath) + } else if curVal, err := os.ReadFile(sysPath); err != nil { + errF = errors.Wrapf(err, "unable to read '%s'", sysPath) + } else if strings.TrimSpace(string(curVal)) == v { + // The value is already correct, don't try to write the file in case + // "/proc/sys/net" is a read-only filesystem. + } else if err := os.WriteFile(sysPath, []byte(v), 0o644); err != nil { + errF = errors.Wrapf(err, "unable to write to '%s'", sysPath) + } + } + + if err := n.InvokeFunc(f); err != nil { + return errors.Wrapf(err, "failed to run sysctl setter in network namespace") + } + if errF != nil { + return errF + } + } + return nil +} + func setInterfaceName(nlh *netlink.Handle, iface netlink.Link, i *Interface) error { return nlh.LinkSetName(iface, i.DstName()) } diff --git a/libnetwork/osl/options_linux.go b/libnetwork/osl/options_linux.go index bc617c7b0a..f2e668f242 100644 --- a/libnetwork/osl/options_linux.go +++ b/libnetwork/osl/options_linux.go @@ -81,3 +81,11 @@ func WithRoutes(routes []*net.IPNet) IfaceOption { return nil } } + +// WithSysctls sets the interface sysctls. +func WithSysctls(sysctls []string) IfaceOption { + return func(i *Interface) error { + i.sysctls = sysctls + return nil + } +} diff --git a/libnetwork/sandbox_linux.go b/libnetwork/sandbox_linux.go index c34e96cce6..3cae9b9904 100644 --- a/libnetwork/sandbox_linux.go +++ b/libnetwork/sandbox_linux.go @@ -315,6 +315,9 @@ func (sb *Sandbox) populateNetworkResources(ep *Endpoint) error { if i.mac != nil { ifaceOptions = append(ifaceOptions, osl.WithMACAddress(i.mac)) } + if sysctls := ep.getSysctls(); len(sysctls) > 0 { + ifaceOptions = append(ifaceOptions, osl.WithSysctls(sysctls)) + } if err := sb.osSbox.AddInterface(i.srcName, i.dstPrefix, ifaceOptions...); err != nil { return fmt.Errorf("failed to add interface %s to sandbox: %v", i.srcName, err)