libnet/d/bridge: port mappings: drop direct-access when gw_mode=nat

When a NAT-based port mapping is created, the daemon adds a DNAT rule in
nat-DOCKER to replace the dest addr with the container IP. However, the
daemon never sets up rules to filter packets destined directly to the
container port. This allows a rogue neighbor (ie. a host that shares a
L2 segment with the host) to send packets directly to the container on
its container-side exposed port.

For instance, if container port 5000 is mapped to host port 6000, a
neighbor could send packets directly to the container on its port 5000.

Since nat-DOCKER mangles the dest addr, and the nat table forbids DROP
rules, this change adds a new rule in the raw-PREROUTING chain to filter
ingress connections targeting the container's IP address.

This filtering is only done when gw_mode=nat. For the unprotected
variant, no filtering is done.

Signed-off-by: Albin Kerouanton <albinker@gmail.com>
This commit is contained in:
Albin Kerouanton
2025-01-22 08:42:37 +01:00
parent 8474153e13
commit 27adcd596b
9 changed files with 314 additions and 11 deletions

View File

@@ -542,6 +542,7 @@ RUN --mount=type=cache,sharing=locked,id=moby-dev-aptlib,target=/var/lib/apt \
libprotobuf-c1 \
libyajl2 \
net-tools \
netcat-openbsd \
patch \
pigz \
sudo \

View File

@@ -27,6 +27,22 @@ func WithIPv6() func(*network.CreateOptions) {
}
}
// WithIPv4Disabled makes sure IPv4 is disabled on the network.
func WithIPv4Disabled() func(*network.CreateOptions) {
return func(n *network.CreateOptions) {
enable := false
n.EnableIPv4 = &enable
}
}
// WithIPv6Disabled makes sure IPv6 is disabled on the network.
func WithIPv6Disabled() func(*network.CreateOptions) {
return func(n *network.CreateOptions) {
enable := false
n.EnableIPv6 = &enable
}
}
// WithInternal enables Internal flag on the create network request
func WithInternal() func(*network.CreateOptions) {
return func(n *network.CreateOptions) {

View File

@@ -92,7 +92,7 @@ Note that:
[1]: https://github.com/moby/moby/blob/675c2ac2db93e38bb9c5a6615d4155a969535fd9/libnetwork/drivers/bridge/port_mapping_linux.go#L795
[2]: https://github.com/robmry/moby/blob/52c89d467fc5326149e4bbb8903d23589b66ff0d/libnetwork/drivers/bridge/setup_ip_tables_linux.go#L252
And the corresponding nat table:
The corresponding nat table:
Chain PREROUTING (policy ACCEPT 0 packets, 0 bytes)
num pkts bytes target prot opt in out source destination
@@ -135,3 +135,27 @@ And the corresponding nat table:
</details>
And the raw table:
Chain PREROUTING (policy ACCEPT 0 packets, 0 bytes)
num pkts bytes target prot opt in out source destination
1 0 0 DROP 6 -- !bridge1 * 0.0.0.0/0 192.0.2.2 tcp dpt:80
Chain OUTPUT (policy ACCEPT 0 packets, 0 bytes)
num pkts bytes target prot opt in out source destination
<details>
<summary>iptables commands</summary>
-P PREROUTING ACCEPT
-P OUTPUT ACCEPT
-A PREROUTING -d 192.0.2.2/32 ! -i bridge1 -p tcp -m tcp --dport 80 -j DROP
</details>
[filterDirectAccess][3] adds a DROP rule to the raw-PREROUTING chain to block direct remote access to the mapped port.
[3]: https://github.com/search?q=repo%3Amoby%2Fmoby%20filterDirectAccess&type=code

View File

@@ -188,6 +188,8 @@ const (
iptCmdSFilterDocker4 iptCmdType = "SFilterDocker4"
iptCmdLNat4 iptCmdType = "LNat4"
iptCmdSNat4 iptCmdType = "SNat4"
iptCmdLRaw4 iptCmdType = "LRaw4"
iptCmdSRaw4 iptCmdType = "SRaw4"
)
var iptCmds = map[iptCmdType][]string{
@@ -198,6 +200,8 @@ var iptCmds = map[iptCmdType][]string{
iptCmdSFilterDocker4: {"iptables", "-S", "DOCKER"},
iptCmdLNat4: {"iptables", "-nvL", "--line-numbers", "-t", "nat"},
iptCmdSNat4: {"iptables", "-S", "-t", "nat"},
iptCmdLRaw4: {"iptables", "-nvL", "--line-numbers", "-t", "raw"},
iptCmdSRaw4: {"iptables", "-S", "-t", "raw"},
}
func TestBridgeIptablesDoc(t *testing.T) {

View File

@@ -37,7 +37,7 @@ Note that:
[1]: https://github.com/moby/moby/blob/675c2ac2db93e38bb9c5a6615d4155a969535fd9/libnetwork/drivers/bridge/port_mapping_linux.go#L795
[2]: https://github.com/robmry/moby/blob/52c89d467fc5326149e4bbb8903d23589b66ff0d/libnetwork/drivers/bridge/setup_ip_tables_linux.go#L252
And the corresponding nat table:
The corresponding nat table:
{{index . "LNat4"}}
@@ -47,3 +47,18 @@ And the corresponding nat table:
{{index . "SNat4"}}
</details>
And the raw table:
{{index . "LRaw4"}}
<details>
<summary>iptables commands</summary>
{{index . "SRaw4"}}
</details>
[filterDirectAccess][3] adds a DROP rule to the raw-PREROUTING chain to block direct remote access to the mapped port.
[3]: https://github.com/search?q=repo%3Amoby%2Fmoby%20filterDirectAccess&type=code

View File

@@ -728,6 +728,11 @@ func TestDirectRoutingOpenPorts(t *testing.T) {
"nat-unprotected": pingSuccess,
"routed": pingSuccess,
}
expMappedPortHTTP := map[string]string{
"nat": httpFail,
"nat-unprotected": httpSuccess,
"routed": httpSuccess,
}
expUnmappedPortHTTP := map[string]string{
"nat": httpFail,
"nat-unprotected": httpSuccess,
@@ -769,13 +774,13 @@ func TestDirectRoutingOpenPorts(t *testing.T) {
testPing(t, "ping6", networks[gwMode].ipv6, expPingExit[gwMode])
})
t.Run(gwMode+"/v4/http/80", func(t *testing.T) {
testHttp(t, networks[gwMode].ipv4, "80", httpSuccess)
testHttp(t, networks[gwMode].ipv4, "80", expMappedPortHTTP[gwMode])
})
t.Run(gwMode+"/v4/http/81", func(t *testing.T) {
testHttp(t, networks[gwMode].ipv4, "81", expUnmappedPortHTTP[gwMode])
})
t.Run(gwMode+"/v6/http/80", func(t *testing.T) {
testHttp(t, networks[gwMode].ipv6, "80", httpSuccess)
testHttp(t, networks[gwMode].ipv6, "80", expMappedPortHTTP[gwMode])
})
t.Run(gwMode+"/v6/http/81", func(t *testing.T) {
testHttp(t, networks[gwMode].ipv6, "81", expUnmappedPortHTTP[gwMode])
@@ -874,6 +879,201 @@ func TestAccessPublishedPortFromAnotherNetwork(t *testing.T) {
}
}
// TestDirectRemoteAccessOnExposedPort checks that remote hosts can't directly
// reach a container on one of its exposed port. That is, if container has the
// IP address 172.17.24.2, and its port 443 is exposed on the host, no remote
// host should be able to reach 172.17.24.2:443 directly.
//
// Regression test for https://github.com/moby/moby/issues/45610.
func TestDirectRemoteAccessOnExposedPort(t *testing.T) {
// This test checks iptables rules that live in dockerd's netns. In the case
// of rootlesskit, this is not the same netns as the host, so they don't
// have any effect.
// TODO(aker): we need to figure out what we want to do for rootlesskit.
// skip.If(t, testEnv.IsRootless, "rootlesskit has its own netns")
ctx := setupTest(t)
const (
hostIPv4 = "192.168.120.2"
hostIPv6 = "fdbc:277b:d40b::2"
)
l3 := networking.NewL3Segment(t, "test-direct-remote-access",
netip.MustParsePrefix("192.168.120.1/24"),
netip.MustParsePrefix("fdbc:277b:d40b::1/64"))
defer l3.Destroy(t)
// "docker" is the host where dockerd is running.
l3.AddHost(t, "docker", networking.CurrentNetns, "test-eth",
netip.MustParsePrefix(hostIPv4+"/24"),
netip.MustParsePrefix(hostIPv6+"/64"))
l3.AddHost(t, "attacker", "test-direct-remote-access-attacker", "eth0",
netip.MustParsePrefix("192.168.120.3/24"),
netip.MustParsePrefix("fdbc:277b:d40b::3/64"))
d := daemon.New(t)
d.StartWithBusybox(ctx, t)
defer d.Stop(t)
c := d.NewClientT(t)
defer c.Close()
for _, tc := range []struct {
name string
gwMode string
gwAddr netip.Prefix
ipv4Disabled bool
ipv6Disabled bool
}{
{
name: "NAT/IPv4",
gwMode: "nat",
gwAddr: netip.MustParsePrefix("172.24.10.1/24"),
},
{
name: "NAT/IPv6",
gwMode: "nat",
gwAddr: netip.MustParsePrefix("fda9:a651:db6d::1/64"),
},
{
name: "NAT unprotected/IPv4",
gwMode: "nat-unprotected",
gwAddr: netip.MustParsePrefix("172.24.10.1/24"),
},
{
name: "NAT unprotected/IPv6",
gwMode: "nat-unprotected",
gwAddr: netip.MustParsePrefix("fda9:a651:db6d::1/64"),
},
{
name: "Proxy/IPv4",
gwMode: "nat",
gwAddr: netip.MustParsePrefix("fd05:b021:403f::1/64"),
ipv4Disabled: true,
},
{
name: "Proxy/IPv6",
gwMode: "nat",
gwAddr: netip.MustParsePrefix("172.24.11.1/24"),
ipv6Disabled: true,
},
{
name: "Routed/IPv4",
gwMode: "routed",
gwAddr: netip.MustParsePrefix("172.24.12.1/24"),
},
{
name: "Routed/IPv6",
gwMode: "routed",
gwAddr: netip.MustParsePrefix("fd82:5787:b217::1/64"),
},
} {
t.Run(tc.name, func(t *testing.T) {
skip.If(t, tc.gwMode == "routed" && testEnv.IsRootless(), "rootlesskit doesn't support routed mode as it's running in a separate netns")
testutil.StartSpan(ctx, t)
nwOpts := []func(*networktypes.CreateOptions){
network.WithIPAM(tc.gwAddr.Masked().String(), tc.gwAddr.Addr().String()),
network.WithOption(bridge.IPv4GatewayMode, tc.gwMode),
network.WithOption(bridge.IPv6GatewayMode, tc.gwMode),
}
if tc.ipv4Disabled {
nwOpts = append(nwOpts, network.WithIPv4Disabled())
}
if tc.ipv6Disabled {
nwOpts = append(nwOpts, network.WithIPv6Disabled())
}
if tc.gwAddr.Addr().Is6() {
nwOpts = append(nwOpts, network.WithIPv6())
}
const bridgeName = "brattacked"
network.CreateNoError(ctx, t, c, bridgeName, append(nwOpts,
network.WithDriver("bridge"),
network.WithOption(bridge.BridgeName, bridgeName),
)...)
defer network.RemoveNoError(ctx, t, c, bridgeName)
const hostPort = "5000"
hostIP := hostIPv4
if tc.gwAddr.Addr().Is6() {
hostIP = hostIPv6
}
ctrIP := tc.gwAddr.Addr().Next()
test := func(t *testing.T, host networking.Host, daddr, payload string) bool {
serverID := container.Run(ctx, t, c,
container.WithName(sanitizeCtrName(t.Name()+"-server")),
container.WithCmd("nc", "-lup", "5000"),
container.WithExposedPorts("5000/udp"),
container.WithPortMap(nat.PortMap{"5000/udp": {{HostPort: hostPort}}}),
container.WithNetworkMode(bridgeName),
container.WithEndpointSettings(bridgeName, &networktypes.EndpointSettings{
IPAddress: ctrIP.String(),
IPPrefixLen: ctrIP.BitLen(),
}))
defer c.ContainerRemove(ctx, serverID, containertypes.RemoveOptions{Force: true})
return sendPayloadFromHost(t, host, daddr, hostPort, payload, func() bool {
logs := getContainerStdout(t, ctx, c, serverID)
return strings.Contains(logs, payload)
})
}
if tc.gwMode != "routed" {
// Send a payload to the port mapped on the host -- this should work
res := test(t, l3.Hosts["attacker"], hostIP, "foobar")
assert.Assert(t, res, "Remote host should have access to port published on the host, but no payload was received by the container")
}
// Now send a payload directly to the container. With gw_mode=routed,
// this should work. With gw_mode=nat, this should fail.
expDirectAccess := tc.gwMode == "routed" || (tc.gwMode == "nat-unprotected" && !testEnv.IsRootless())
l3.Hosts["attacker"].Run(t, "ip", "route", "add", tc.gwAddr.Masked().String(), "via", hostIP, "dev", "eth0")
defer l3.Hosts["attacker"].Run(t, "ip", "route", "delete", tc.gwAddr.Masked().String(), "via", hostIP, "dev", "eth0")
res := test(t, l3.Hosts["attacker"], ctrIP.String(), "bar baz")
if expDirectAccess {
assert.Assert(t, res, "Remote host should have direct access to the published port, but no payload was received by the container")
} else {
assert.Assert(t, !res, "Remote host should not have direct access to the published port, but payload was received by the container")
}
})
}
}
// Send a payload to daddr:dport a few times from the 'host' netns. Stop
// sending payloads when 'check' returns true. Return the result of 'check'.
//
// UDP is preferred here as it's a one-way, connectionless protocol. With TCP
// the three-way handshake has to be completed before sending a payload, but
// since some test cases try to spoof the loopback address, the 'attacker host'
// will drop the SYN-ACK by default (because the source addr will be considered
// invalid / non-routable). This would require further tuning to make it work.
// With UDP, this problem doesn't exist - the payload can be sent straight away.
// But UDP is inherently unreliable, so we need to send the payload multiple
// times.
func sendPayloadFromHost(t *testing.T, host networking.Host, daddr, dport, payload string, check func() bool) bool {
var res bool
host.Do(t, func() {
for i := 0; i < 10; i++ {
t.Logf("Sending probe #%d to %s:%s from host %s", i, daddr, dport, host.Name)
icmd.RunCommand("/bin/sh", "-c", fmt.Sprintf("echo '%s' | nc -w1 -u %s %s", payload, daddr, dport)).Assert(t, icmd.Success)
res = check()
if res {
return
}
time.Sleep(50 * time.Millisecond)
}
})
return res
}
func getContainerStdout(t *testing.T, ctx context.Context, c *client.Client, ctrID string) string {
logReader, err := c.ContainerLogs(ctx, ctrID, containertypes.LogsOptions{
ShowStdout: true,

View File

@@ -45,7 +45,7 @@ func NewL3Segment(t *testing.T, nsName string, addrs ...netip.Prefix) *L3Segment
Hosts: map[string]Host{},
}
l3.bridge = newHost(t, nsName, "br0")
l3.bridge = newHost(t, "bridge", nsName, "br0")
defer func() {
if t.Failed() {
l3.Destroy(t)
@@ -70,7 +70,7 @@ func (l3 *L3Segment) AddHost(t *testing.T, hostname, nsName, ifname string, addr
t.Fatalf("hostname too long")
}
host := newHost(t, nsName, ifname)
host := newHost(t, hostname, nsName, ifname)
l3.Hosts[hostname] = host
host.MustRun(t, "ip", "link", "add", hostname, "netns", l3.bridge.ns, "type", "veth", "peer", "name", host.Iface)
@@ -83,6 +83,7 @@ func (l3 *L3Segment) AddHost(t *testing.T, hostname, nsName, ifname string, addr
}
func (l3 *L3Segment) Destroy(t *testing.T) {
t.Helper()
for _, host := range l3.Hosts {
host.Destroy(t)
}
@@ -90,11 +91,12 @@ func (l3 *L3Segment) Destroy(t *testing.T) {
}
type Host struct {
Name string
Iface string // Iface is the interface name in the host network namespace.
ns string // ns is the network namespace name.
}
func newHost(t *testing.T, nsName, ifname string) Host {
func newHost(t *testing.T, hostname, nsName, ifname string) Host {
t.Helper()
if len(ifname) >= syscall.IFNAMSIZ {
@@ -109,6 +111,7 @@ func newHost(t *testing.T, nsName, ifname string) Host {
}
return Host{
Name: hostname,
Iface: ifname,
ns: nsName,
}

View File

@@ -176,9 +176,9 @@ func (n *bridgeNetwork) addPortMappings(
}
for i := range bindings {
if pdc != nil && bindings[i].HostPort != 0 {
b := bindings[i]
if pdc != nil && b.HostPort != 0 {
var err error
b := &bindings[i]
hip, ok := netip.AddrFromSlice(b.HostIP)
if !ok {
return nil, fmt.Errorf("invalid host IP address in %s", b)
@@ -187,12 +187,15 @@ func (n *bridgeNetwork) addPortMappings(
if !ok {
return nil, fmt.Errorf("invalid child host IP address %s in %s", b.childHostIP, b)
}
b.portDriverRemove, err = pdc.AddPort(ctx, b.Proto.String(), hip, chip, int(b.HostPort))
bindings[i].portDriverRemove, err = pdc.AddPort(ctx, b.Proto.String(), hip, chip, int(b.HostPort))
if err != nil {
return nil, err
}
}
if err := n.setPerPortIptables(bindings[i], true); err != nil {
if err := n.setPerPortIptables(b, true); err != nil {
return nil, err
}
if err := n.filterDirectAccess(b, true); err != nil {
return nil, err
}
}
@@ -744,6 +747,9 @@ func (n *bridgeNetwork) releasePortBindings(pbs []portBinding) error {
if err := n.setPerPortIptables(pb, false); err != nil {
errs = append(errs, fmt.Errorf("failed to remove iptables rules for port mapping %s: %w", pb, err))
}
if err := n.filterDirectAccess(pb, false); err != nil {
errs = append(errs, err)
}
if pb.HostPort > 0 {
portallocator.Get().ReleasePort(pb.childHostIP, pb.Proto.String(), int(pb.HostPort))
}
@@ -866,6 +872,38 @@ func setPerPortForwarding(b portBinding, ipv iptables.IPVersion, bridgeName stri
return nil
}
// filterDirectAccess adds an iptables rule that drops 'direct' remote
// connections made to the container's IP address, when the network gateway
// mode is "nat".
//
// This is a no-op if the gw_mode is "nat-unprotected" or "routed".
func (n *bridgeNetwork) filterDirectAccess(b portBinding, enable bool) error {
ipv := iptables.IPv4
if b.IP.To4() == nil {
ipv = iptables.IPv6
}
// gw_mode=nat-unprotected means there's minimal security for NATed ports,
// so don't filter direct access.
if n.gwMode(ipv).unprotected() || n.gwMode(ipv).routed() {
return nil
}
bridgeName := n.getNetworkBridgeName()
drop := iptables.Rule{IPVer: ipv, Table: iptables.Raw, Chain: "PREROUTING", Args: []string{
"-p", b.Proto.String(),
"-d", b.IP.String(), // Container IP address
"--dport", strconv.Itoa(int(b.Port)), // Container port
"!", "-i", bridgeName,
"-j", "DROP",
}}
if err := appendOrDelChainRule(drop, "DIRECT ACCESS FILTERING - DROP", enable); err != nil {
return err
}
return nil
}
func (n *bridgeNetwork) reapplyPerPortIptables4() {
n.reapplyPerPortIptables(func(b portBinding) bool { return b.IP.To4() != nil })
}

View File

@@ -49,6 +49,8 @@ const (
Filter Table = "filter"
// Mangle table is used for mangling the packet.
Mangle Table = "mangle"
// Raw table is used for filtering packets before they are NATed.
Raw Table = "raw"
)
// IPVersion refers to IP version, v4 or v6