From 27adcd596baff0d365d58c495c3b5e5e1d3bb54a Mon Sep 17 00:00:00 2001 From: Albin Kerouanton Date: Wed, 22 Jan 2025 08:42:37 +0100 Subject: [PATCH] libnet/d/bridge: port mappings: drop direct-access when gw_mode=nat When a NAT-based port mapping is created, the daemon adds a DNAT rule in nat-DOCKER to replace the dest addr with the container IP. However, the daemon never sets up rules to filter packets destined directly to the container port. This allows a rogue neighbor (ie. a host that shares a L2 segment with the host) to send packets directly to the container on its container-side exposed port. For instance, if container port 5000 is mapped to host port 6000, a neighbor could send packets directly to the container on its port 5000. Since nat-DOCKER mangles the dest addr, and the nat table forbids DROP rules, this change adds a new rule in the raw-PREROUTING chain to filter ingress connections targeting the container's IP address. This filtering is only done when gw_mode=nat. For the unprotected variant, no filtering is done. Signed-off-by: Albin Kerouanton --- Dockerfile | 1 + integration/internal/network/ops.go | 16 ++ .../iptablesdoc/generated/usernet-portmap.md | 26 ++- .../iptablesdoc/iptablesdoc_linux_test.go | 4 + .../iptablesdoc/templates/usernet-portmap.md | 17 +- .../networking/port_mapping_linux_test.go | 204 +++++++++++++++++- .../testutils/networking/l3_segment_linux.go | 9 +- .../drivers/bridge/port_mapping_linux.go | 46 +++- libnetwork/iptables/iptables.go | 2 + 9 files changed, 314 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index edff43953e..735e1e4a37 100644 --- a/Dockerfile +++ b/Dockerfile @@ -542,6 +542,7 @@ RUN --mount=type=cache,sharing=locked,id=moby-dev-aptlib,target=/var/lib/apt \ libprotobuf-c1 \ libyajl2 \ net-tools \ + netcat-openbsd \ patch \ pigz \ sudo \ diff --git a/integration/internal/network/ops.go b/integration/internal/network/ops.go index b5a071c222..78b7cf0455 100644 --- a/integration/internal/network/ops.go +++ b/integration/internal/network/ops.go @@ -27,6 +27,22 @@ func WithIPv6() func(*network.CreateOptions) { } } +// WithIPv4Disabled makes sure IPv4 is disabled on the network. +func WithIPv4Disabled() func(*network.CreateOptions) { + return func(n *network.CreateOptions) { + enable := false + n.EnableIPv4 = &enable + } +} + +// WithIPv6Disabled makes sure IPv6 is disabled on the network. +func WithIPv6Disabled() func(*network.CreateOptions) { + return func(n *network.CreateOptions) { + enable := false + n.EnableIPv6 = &enable + } +} + // WithInternal enables Internal flag on the create network request func WithInternal() func(*network.CreateOptions) { return func(n *network.CreateOptions) { diff --git a/integration/network/bridge/iptablesdoc/generated/usernet-portmap.md b/integration/network/bridge/iptablesdoc/generated/usernet-portmap.md index a9bb5af3bc..d86c5b522c 100644 --- a/integration/network/bridge/iptablesdoc/generated/usernet-portmap.md +++ b/integration/network/bridge/iptablesdoc/generated/usernet-portmap.md @@ -92,7 +92,7 @@ Note that: [1]: https://github.com/moby/moby/blob/675c2ac2db93e38bb9c5a6615d4155a969535fd9/libnetwork/drivers/bridge/port_mapping_linux.go#L795 [2]: https://github.com/robmry/moby/blob/52c89d467fc5326149e4bbb8903d23589b66ff0d/libnetwork/drivers/bridge/setup_ip_tables_linux.go#L252 -And the corresponding nat table: +The corresponding nat table: Chain PREROUTING (policy ACCEPT 0 packets, 0 bytes) num pkts bytes target prot opt in out source destination @@ -135,3 +135,27 @@ And the corresponding nat table: + +And the raw table: + + Chain PREROUTING (policy ACCEPT 0 packets, 0 bytes) + num pkts bytes target prot opt in out source destination + 1 0 0 DROP 6 -- !bridge1 * 0.0.0.0/0 192.0.2.2 tcp dpt:80 + + Chain OUTPUT (policy ACCEPT 0 packets, 0 bytes) + num pkts bytes target prot opt in out source destination + + +
+iptables commands + + -P PREROUTING ACCEPT + -P OUTPUT ACCEPT + -A PREROUTING -d 192.0.2.2/32 ! -i bridge1 -p tcp -m tcp --dport 80 -j DROP + + +
+ +[filterDirectAccess][3] adds a DROP rule to the raw-PREROUTING chain to block direct remote access to the mapped port. + +[3]: https://github.com/search?q=repo%3Amoby%2Fmoby%20filterDirectAccess&type=code diff --git a/integration/network/bridge/iptablesdoc/iptablesdoc_linux_test.go b/integration/network/bridge/iptablesdoc/iptablesdoc_linux_test.go index 122896a8a7..13c1758a73 100644 --- a/integration/network/bridge/iptablesdoc/iptablesdoc_linux_test.go +++ b/integration/network/bridge/iptablesdoc/iptablesdoc_linux_test.go @@ -188,6 +188,8 @@ const ( iptCmdSFilterDocker4 iptCmdType = "SFilterDocker4" iptCmdLNat4 iptCmdType = "LNat4" iptCmdSNat4 iptCmdType = "SNat4" + iptCmdLRaw4 iptCmdType = "LRaw4" + iptCmdSRaw4 iptCmdType = "SRaw4" ) var iptCmds = map[iptCmdType][]string{ @@ -198,6 +200,8 @@ var iptCmds = map[iptCmdType][]string{ iptCmdSFilterDocker4: {"iptables", "-S", "DOCKER"}, iptCmdLNat4: {"iptables", "-nvL", "--line-numbers", "-t", "nat"}, iptCmdSNat4: {"iptables", "-S", "-t", "nat"}, + iptCmdLRaw4: {"iptables", "-nvL", "--line-numbers", "-t", "raw"}, + iptCmdSRaw4: {"iptables", "-S", "-t", "raw"}, } func TestBridgeIptablesDoc(t *testing.T) { diff --git a/integration/network/bridge/iptablesdoc/templates/usernet-portmap.md b/integration/network/bridge/iptablesdoc/templates/usernet-portmap.md index c7e348c527..62b7669069 100644 --- a/integration/network/bridge/iptablesdoc/templates/usernet-portmap.md +++ b/integration/network/bridge/iptablesdoc/templates/usernet-portmap.md @@ -37,7 +37,7 @@ Note that: [1]: https://github.com/moby/moby/blob/675c2ac2db93e38bb9c5a6615d4155a969535fd9/libnetwork/drivers/bridge/port_mapping_linux.go#L795 [2]: https://github.com/robmry/moby/blob/52c89d467fc5326149e4bbb8903d23589b66ff0d/libnetwork/drivers/bridge/setup_ip_tables_linux.go#L252 -And the corresponding nat table: +The corresponding nat table: {{index . "LNat4"}} @@ -47,3 +47,18 @@ And the corresponding nat table: {{index . "SNat4"}} + +And the raw table: + + {{index . "LRaw4"}} + +
+iptables commands + + {{index . "SRaw4"}} + +
+ +[filterDirectAccess][3] adds a DROP rule to the raw-PREROUTING chain to block direct remote access to the mapped port. + +[3]: https://github.com/search?q=repo%3Amoby%2Fmoby%20filterDirectAccess&type=code diff --git a/integration/networking/port_mapping_linux_test.go b/integration/networking/port_mapping_linux_test.go index 345a8f59d8..94d7b98b38 100644 --- a/integration/networking/port_mapping_linux_test.go +++ b/integration/networking/port_mapping_linux_test.go @@ -728,6 +728,11 @@ func TestDirectRoutingOpenPorts(t *testing.T) { "nat-unprotected": pingSuccess, "routed": pingSuccess, } + expMappedPortHTTP := map[string]string{ + "nat": httpFail, + "nat-unprotected": httpSuccess, + "routed": httpSuccess, + } expUnmappedPortHTTP := map[string]string{ "nat": httpFail, "nat-unprotected": httpSuccess, @@ -769,13 +774,13 @@ func TestDirectRoutingOpenPorts(t *testing.T) { testPing(t, "ping6", networks[gwMode].ipv6, expPingExit[gwMode]) }) t.Run(gwMode+"/v4/http/80", func(t *testing.T) { - testHttp(t, networks[gwMode].ipv4, "80", httpSuccess) + testHttp(t, networks[gwMode].ipv4, "80", expMappedPortHTTP[gwMode]) }) t.Run(gwMode+"/v4/http/81", func(t *testing.T) { testHttp(t, networks[gwMode].ipv4, "81", expUnmappedPortHTTP[gwMode]) }) t.Run(gwMode+"/v6/http/80", func(t *testing.T) { - testHttp(t, networks[gwMode].ipv6, "80", httpSuccess) + testHttp(t, networks[gwMode].ipv6, "80", expMappedPortHTTP[gwMode]) }) t.Run(gwMode+"/v6/http/81", func(t *testing.T) { testHttp(t, networks[gwMode].ipv6, "81", expUnmappedPortHTTP[gwMode]) @@ -874,6 +879,201 @@ func TestAccessPublishedPortFromAnotherNetwork(t *testing.T) { } } +// TestDirectRemoteAccessOnExposedPort checks that remote hosts can't directly +// reach a container on one of its exposed port. That is, if container has the +// IP address 172.17.24.2, and its port 443 is exposed on the host, no remote +// host should be able to reach 172.17.24.2:443 directly. +// +// Regression test for https://github.com/moby/moby/issues/45610. +func TestDirectRemoteAccessOnExposedPort(t *testing.T) { + // This test checks iptables rules that live in dockerd's netns. In the case + // of rootlesskit, this is not the same netns as the host, so they don't + // have any effect. + // TODO(aker): we need to figure out what we want to do for rootlesskit. + // skip.If(t, testEnv.IsRootless, "rootlesskit has its own netns") + + ctx := setupTest(t) + + const ( + hostIPv4 = "192.168.120.2" + hostIPv6 = "fdbc:277b:d40b::2" + ) + + l3 := networking.NewL3Segment(t, "test-direct-remote-access", + netip.MustParsePrefix("192.168.120.1/24"), + netip.MustParsePrefix("fdbc:277b:d40b::1/64")) + defer l3.Destroy(t) + // "docker" is the host where dockerd is running. + l3.AddHost(t, "docker", networking.CurrentNetns, "test-eth", + netip.MustParsePrefix(hostIPv4+"/24"), + netip.MustParsePrefix(hostIPv6+"/64")) + l3.AddHost(t, "attacker", "test-direct-remote-access-attacker", "eth0", + netip.MustParsePrefix("192.168.120.3/24"), + netip.MustParsePrefix("fdbc:277b:d40b::3/64")) + + d := daemon.New(t) + d.StartWithBusybox(ctx, t) + defer d.Stop(t) + + c := d.NewClientT(t) + defer c.Close() + for _, tc := range []struct { + name string + gwMode string + gwAddr netip.Prefix + ipv4Disabled bool + ipv6Disabled bool + }{ + { + name: "NAT/IPv4", + gwMode: "nat", + gwAddr: netip.MustParsePrefix("172.24.10.1/24"), + }, + { + name: "NAT/IPv6", + gwMode: "nat", + gwAddr: netip.MustParsePrefix("fda9:a651:db6d::1/64"), + }, + { + name: "NAT unprotected/IPv4", + gwMode: "nat-unprotected", + gwAddr: netip.MustParsePrefix("172.24.10.1/24"), + }, + { + name: "NAT unprotected/IPv6", + gwMode: "nat-unprotected", + gwAddr: netip.MustParsePrefix("fda9:a651:db6d::1/64"), + }, + { + name: "Proxy/IPv4", + gwMode: "nat", + gwAddr: netip.MustParsePrefix("fd05:b021:403f::1/64"), + ipv4Disabled: true, + }, + { + name: "Proxy/IPv6", + gwMode: "nat", + gwAddr: netip.MustParsePrefix("172.24.11.1/24"), + ipv6Disabled: true, + }, + { + name: "Routed/IPv4", + gwMode: "routed", + gwAddr: netip.MustParsePrefix("172.24.12.1/24"), + }, + { + name: "Routed/IPv6", + gwMode: "routed", + gwAddr: netip.MustParsePrefix("fd82:5787:b217::1/64"), + }, + } { + t.Run(tc.name, func(t *testing.T) { + skip.If(t, tc.gwMode == "routed" && testEnv.IsRootless(), "rootlesskit doesn't support routed mode as it's running in a separate netns") + + testutil.StartSpan(ctx, t) + + nwOpts := []func(*networktypes.CreateOptions){ + network.WithIPAM(tc.gwAddr.Masked().String(), tc.gwAddr.Addr().String()), + network.WithOption(bridge.IPv4GatewayMode, tc.gwMode), + network.WithOption(bridge.IPv6GatewayMode, tc.gwMode), + } + + if tc.ipv4Disabled { + nwOpts = append(nwOpts, network.WithIPv4Disabled()) + } + if tc.ipv6Disabled { + nwOpts = append(nwOpts, network.WithIPv6Disabled()) + } + if tc.gwAddr.Addr().Is6() { + nwOpts = append(nwOpts, network.WithIPv6()) + } + + const bridgeName = "brattacked" + network.CreateNoError(ctx, t, c, bridgeName, append(nwOpts, + network.WithDriver("bridge"), + network.WithOption(bridge.BridgeName, bridgeName), + )...) + defer network.RemoveNoError(ctx, t, c, bridgeName) + + const hostPort = "5000" + hostIP := hostIPv4 + if tc.gwAddr.Addr().Is6() { + hostIP = hostIPv6 + } + + ctrIP := tc.gwAddr.Addr().Next() + + test := func(t *testing.T, host networking.Host, daddr, payload string) bool { + serverID := container.Run(ctx, t, c, + container.WithName(sanitizeCtrName(t.Name()+"-server")), + container.WithCmd("nc", "-lup", "5000"), + container.WithExposedPorts("5000/udp"), + container.WithPortMap(nat.PortMap{"5000/udp": {{HostPort: hostPort}}}), + container.WithNetworkMode(bridgeName), + container.WithEndpointSettings(bridgeName, &networktypes.EndpointSettings{ + IPAddress: ctrIP.String(), + IPPrefixLen: ctrIP.BitLen(), + })) + defer c.ContainerRemove(ctx, serverID, containertypes.RemoveOptions{Force: true}) + + return sendPayloadFromHost(t, host, daddr, hostPort, payload, func() bool { + logs := getContainerStdout(t, ctx, c, serverID) + return strings.Contains(logs, payload) + }) + } + + if tc.gwMode != "routed" { + // Send a payload to the port mapped on the host -- this should work + res := test(t, l3.Hosts["attacker"], hostIP, "foobar") + assert.Assert(t, res, "Remote host should have access to port published on the host, but no payload was received by the container") + } + + // Now send a payload directly to the container. With gw_mode=routed, + // this should work. With gw_mode=nat, this should fail. + expDirectAccess := tc.gwMode == "routed" || (tc.gwMode == "nat-unprotected" && !testEnv.IsRootless()) + + l3.Hosts["attacker"].Run(t, "ip", "route", "add", tc.gwAddr.Masked().String(), "via", hostIP, "dev", "eth0") + defer l3.Hosts["attacker"].Run(t, "ip", "route", "delete", tc.gwAddr.Masked().String(), "via", hostIP, "dev", "eth0") + + res := test(t, l3.Hosts["attacker"], ctrIP.String(), "bar baz") + if expDirectAccess { + assert.Assert(t, res, "Remote host should have direct access to the published port, but no payload was received by the container") + } else { + assert.Assert(t, !res, "Remote host should not have direct access to the published port, but payload was received by the container") + } + }) + } +} + +// Send a payload to daddr:dport a few times from the 'host' netns. Stop +// sending payloads when 'check' returns true. Return the result of 'check'. +// +// UDP is preferred here as it's a one-way, connectionless protocol. With TCP +// the three-way handshake has to be completed before sending a payload, but +// since some test cases try to spoof the loopback address, the 'attacker host' +// will drop the SYN-ACK by default (because the source addr will be considered +// invalid / non-routable). This would require further tuning to make it work. +// With UDP, this problem doesn't exist - the payload can be sent straight away. +// But UDP is inherently unreliable, so we need to send the payload multiple +// times. +func sendPayloadFromHost(t *testing.T, host networking.Host, daddr, dport, payload string, check func() bool) bool { + var res bool + host.Do(t, func() { + for i := 0; i < 10; i++ { + t.Logf("Sending probe #%d to %s:%s from host %s", i, daddr, dport, host.Name) + icmd.RunCommand("/bin/sh", "-c", fmt.Sprintf("echo '%s' | nc -w1 -u %s %s", payload, daddr, dport)).Assert(t, icmd.Success) + + res = check() + if res { + return + } + + time.Sleep(50 * time.Millisecond) + } + }) + return res +} + func getContainerStdout(t *testing.T, ctx context.Context, c *client.Client, ctrID string) string { logReader, err := c.ContainerLogs(ctx, ctrID, containertypes.LogsOptions{ ShowStdout: true, diff --git a/internal/testutils/networking/l3_segment_linux.go b/internal/testutils/networking/l3_segment_linux.go index 418e596e70..949c1fd7ec 100644 --- a/internal/testutils/networking/l3_segment_linux.go +++ b/internal/testutils/networking/l3_segment_linux.go @@ -45,7 +45,7 @@ func NewL3Segment(t *testing.T, nsName string, addrs ...netip.Prefix) *L3Segment Hosts: map[string]Host{}, } - l3.bridge = newHost(t, nsName, "br0") + l3.bridge = newHost(t, "bridge", nsName, "br0") defer func() { if t.Failed() { l3.Destroy(t) @@ -70,7 +70,7 @@ func (l3 *L3Segment) AddHost(t *testing.T, hostname, nsName, ifname string, addr t.Fatalf("hostname too long") } - host := newHost(t, nsName, ifname) + host := newHost(t, hostname, nsName, ifname) l3.Hosts[hostname] = host host.MustRun(t, "ip", "link", "add", hostname, "netns", l3.bridge.ns, "type", "veth", "peer", "name", host.Iface) @@ -83,6 +83,7 @@ func (l3 *L3Segment) AddHost(t *testing.T, hostname, nsName, ifname string, addr } func (l3 *L3Segment) Destroy(t *testing.T) { + t.Helper() for _, host := range l3.Hosts { host.Destroy(t) } @@ -90,11 +91,12 @@ func (l3 *L3Segment) Destroy(t *testing.T) { } type Host struct { + Name string Iface string // Iface is the interface name in the host network namespace. ns string // ns is the network namespace name. } -func newHost(t *testing.T, nsName, ifname string) Host { +func newHost(t *testing.T, hostname, nsName, ifname string) Host { t.Helper() if len(ifname) >= syscall.IFNAMSIZ { @@ -109,6 +111,7 @@ func newHost(t *testing.T, nsName, ifname string) Host { } return Host{ + Name: hostname, Iface: ifname, ns: nsName, } diff --git a/libnetwork/drivers/bridge/port_mapping_linux.go b/libnetwork/drivers/bridge/port_mapping_linux.go index dd8e2801ff..43f6c91bd3 100644 --- a/libnetwork/drivers/bridge/port_mapping_linux.go +++ b/libnetwork/drivers/bridge/port_mapping_linux.go @@ -176,9 +176,9 @@ func (n *bridgeNetwork) addPortMappings( } for i := range bindings { - if pdc != nil && bindings[i].HostPort != 0 { + b := bindings[i] + if pdc != nil && b.HostPort != 0 { var err error - b := &bindings[i] hip, ok := netip.AddrFromSlice(b.HostIP) if !ok { return nil, fmt.Errorf("invalid host IP address in %s", b) @@ -187,12 +187,15 @@ func (n *bridgeNetwork) addPortMappings( if !ok { return nil, fmt.Errorf("invalid child host IP address %s in %s", b.childHostIP, b) } - b.portDriverRemove, err = pdc.AddPort(ctx, b.Proto.String(), hip, chip, int(b.HostPort)) + bindings[i].portDriverRemove, err = pdc.AddPort(ctx, b.Proto.String(), hip, chip, int(b.HostPort)) if err != nil { return nil, err } } - if err := n.setPerPortIptables(bindings[i], true); err != nil { + if err := n.setPerPortIptables(b, true); err != nil { + return nil, err + } + if err := n.filterDirectAccess(b, true); err != nil { return nil, err } } @@ -744,6 +747,9 @@ func (n *bridgeNetwork) releasePortBindings(pbs []portBinding) error { if err := n.setPerPortIptables(pb, false); err != nil { errs = append(errs, fmt.Errorf("failed to remove iptables rules for port mapping %s: %w", pb, err)) } + if err := n.filterDirectAccess(pb, false); err != nil { + errs = append(errs, err) + } if pb.HostPort > 0 { portallocator.Get().ReleasePort(pb.childHostIP, pb.Proto.String(), int(pb.HostPort)) } @@ -866,6 +872,38 @@ func setPerPortForwarding(b portBinding, ipv iptables.IPVersion, bridgeName stri return nil } +// filterDirectAccess adds an iptables rule that drops 'direct' remote +// connections made to the container's IP address, when the network gateway +// mode is "nat". +// +// This is a no-op if the gw_mode is "nat-unprotected" or "routed". +func (n *bridgeNetwork) filterDirectAccess(b portBinding, enable bool) error { + ipv := iptables.IPv4 + if b.IP.To4() == nil { + ipv = iptables.IPv6 + } + + // gw_mode=nat-unprotected means there's minimal security for NATed ports, + // so don't filter direct access. + if n.gwMode(ipv).unprotected() || n.gwMode(ipv).routed() { + return nil + } + + bridgeName := n.getNetworkBridgeName() + drop := iptables.Rule{IPVer: ipv, Table: iptables.Raw, Chain: "PREROUTING", Args: []string{ + "-p", b.Proto.String(), + "-d", b.IP.String(), // Container IP address + "--dport", strconv.Itoa(int(b.Port)), // Container port + "!", "-i", bridgeName, + "-j", "DROP", + }} + if err := appendOrDelChainRule(drop, "DIRECT ACCESS FILTERING - DROP", enable); err != nil { + return err + } + + return nil +} + func (n *bridgeNetwork) reapplyPerPortIptables4() { n.reapplyPerPortIptables(func(b portBinding) bool { return b.IP.To4() != nil }) } diff --git a/libnetwork/iptables/iptables.go b/libnetwork/iptables/iptables.go index 3e1110e468..33f23e9a1e 100644 --- a/libnetwork/iptables/iptables.go +++ b/libnetwork/iptables/iptables.go @@ -49,6 +49,8 @@ const ( Filter Table = "filter" // Mangle table is used for mangling the packet. Mangle Table = "mangle" + // Raw table is used for filtering packets before they are NATed. + Raw Table = "raw" ) // IPVersion refers to IP version, v4 or v6