iptables: Direct routing DROP rules per-container, not per-port

Commit 27adcd5 ("libnet/d/bridge: drop connections to lo mappings, and
direct remote connections") introduced an iptables rule to drop 'direct'
remote connections made to the container's IP address - for each
published port on the container.

The normal filter-FORWARD rules would then drop packets sent directly to
unpublished ports. This rule was only created along with the rest of port
publishing (when a container's endpoint was selected as its gateway). Until
then, all packets addressed directly to the container's ports were dropped
by the filter-FORWARD rules.

But, the rule doesn't need to be per-port. Just drop packets sent
directly to a container's address unless they originate on the host.

That means fewer rules, and they can be created along with the endpoint (so
directly-routed packets get dropped at the same point whether or not the
endpoint is currently the gateway - very slightly earlier than before when
it's not the gateway).

Signed-off-by: Rob Murray <rob.murray@docker.com>
This commit is contained in:
Rob Murray
2025-04-17 12:51:57 +01:00
parent 59920a733f
commit a0ff0a361e
8 changed files with 123 additions and 14 deletions

View File

@@ -139,8 +139,8 @@ The filter and nat tables are identical to [nat mode][0]:
Chain PREROUTING (policy ACCEPT 0 packets, 0 bytes)
num pkts bytes target prot opt in out source destination
1 0 0 DROP 6 -- !lo * 0.0.0.0/0 127.0.0.1 tcp dpt:8080
2 0 0 DROP 6 -- !bridge1 * 0.0.0.0/0 192.0.2.2 tcp dpt:80
1 0 0 DROP 0 -- !bridge1 * 0.0.0.0/0 192.0.2.2
2 0 0 DROP 6 -- !lo * 0.0.0.0/0 127.0.0.1 tcp dpt:8080
Chain OUTPUT (policy ACCEPT 0 packets, 0 bytes)
num pkts bytes target prot opt in out source destination
@@ -151,8 +151,8 @@ The filter and nat tables are identical to [nat mode][0]:
-P PREROUTING ACCEPT
-P OUTPUT ACCEPT
-A PREROUTING -d 192.0.2.2/32 ! -i bridge1 -j DROP
-A PREROUTING -d 127.0.0.1/32 ! -i lo -p tcp -m tcp --dport 8080 -j DROP
-A PREROUTING -d 192.0.2.2/32 ! -i bridge1 -p tcp -m tcp --dport 80 -j DROP
</details>

View File

@@ -163,7 +163,7 @@ And the raw table:
Chain PREROUTING (policy ACCEPT 0 packets, 0 bytes)
num pkts bytes target prot opt in out source destination
1 0 0 DROP 6 -- !bridge1 * 0.0.0.0/0 192.0.2.2 tcp dpt:80
1 0 0 DROP 0 -- !bridge1 * 0.0.0.0/0 192.0.2.2
Chain OUTPUT (policy ACCEPT 0 packets, 0 bytes)
num pkts bytes target prot opt in out source destination
@@ -174,7 +174,7 @@ And the raw table:
-P PREROUTING ACCEPT
-P OUTPUT ACCEPT
-A PREROUTING -d 192.0.2.2/32 ! -i bridge1 -p tcp -m tcp --dport 80 -j DROP
-A PREROUTING -d 192.0.2.2/32 ! -i bridge1 -j DROP
</details>

View File

@@ -1,4 +1,4 @@
-P PREROUTING ACCEPT
-P OUTPUT ACCEPT
-A PREROUTING -d 192.168.0.2/32 ! -i docker0 -j DROP
-A PREROUTING -d 127.0.0.1/32 ! -i lo -p tcp -m tcp --dport 8080 -j DROP
-A PREROUTING -d 192.168.0.2/32 ! -i docker0 -p tcp -m tcp --dport 80 -j DROP

View File

@@ -1,3 +1,3 @@
-P PREROUTING ACCEPT
-P OUTPUT ACCEPT
-A PREROUTING -d fd30:1159:a755::2/128 ! -i docker0 -p tcp -m tcp --dport 80 -j DROP
-A PREROUTING -d fd30:1159:a755::2/128 ! -i docker0 -j DROP

View File

@@ -1174,6 +1174,11 @@ func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo dri
}
}
netip4, netip6 := endpoint.netipAddrs()
if err := n.iptablesNetwork.AddEndpoint(ctx, netip4, netip6); err != nil {
return err
}
// Up the host interface after finishing all netlink configuration
if err = d.linkUp(ctx, host); err != nil {
return fmt.Errorf("could not set link up for host interface %s: %v", hostIfName, err)
@@ -1190,6 +1195,18 @@ func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo dri
return nil
}
// netipAddrs converts ep.addr and ep.addrv6 from net.IPNet to netip.Addr. If an address
// is non-nil, it's assumed to be valid.
//
// A nil address yields the zero netip.Addr, for which IsValid reports false.
// The IPv4 result is always unmapped: a net.IP commonly stores an IPv4 address
// in its 16-byte form, which netip.AddrFromSlice turns into an IPv4-mapped IPv6
// address ("::ffff:a.b.c.d"). Callers format the address for IPv4 firewall
// rules, so it must be a plain IPv4 address.
func (ep *bridgeEndpoint) netipAddrs() (v4, v6 netip.Addr) {
	if ep.addr != nil {
		// The ok result is ignored: a non-nil address is assumed to be valid, and
		// an invalid one leaves v4 as the zero Addr (Unmap of which is a no-op).
		v4, _ = netip.AddrFromSlice(ep.addr.IP)
		v4 = v4.Unmap()
	}
	if ep.addrv6 != nil {
		// As above, an invalid slice leaves v6 as the zero Addr.
		v6, _ = netip.AddrFromSlice(ep.addrv6.IP)
	}
	return v4, v6
}
// createVeth creates a veth device with one end in the container's network namespace,
// if it can get hold of the netns path and open the handles. In that case, it returns
// a netlink handle in the container's namespace that must be closed by the caller.
@@ -1282,6 +1299,11 @@ func (d *driver) DeleteEndpoint(nid, eid string) error {
return endpointNotFoundError(eid)
}
netip4, netip6 := ep.netipAddrs()
if err := n.iptablesNetwork.DelEndpoint(context.TODO(), netip4, netip6); err != nil {
return err
}
// Remove it
n.Lock()
delete(n.endpoints, eid)

View File

@@ -87,6 +87,13 @@ func (d *driver) populateEndpoints() error {
continue
}
n.endpoints[ep.id] = ep
netip4, netip6 := ep.netipAddrs()
if err := n.iptablesNetwork.AddEndpoint(context.TODO(), netip4, netip6); err != nil {
log.G(context.TODO()).WithFields(log.Fields{
"error": err,
"ep.id": ep.id,
}).Warn("Failed to restore per-endpoint firewall rules")
}
n.restorePortAllocations(ep)
log.G(context.TODO()).Debugf("Endpoint (%.7s) restored to network (%.7s)", ep.id, ep.nid)
}

View File

@@ -0,0 +1,64 @@
//go:build linux
package iptabler
import (
"context"
"net/netip"
"github.com/docker/docker/libnetwork/iptables"
)
// AddEndpoint sets up the per-endpoint firewall rules for an endpoint with
// addresses epIPv4 and epIPv6, by calling modEndpoint with enable set to true.
// An address that is not valid (the zero netip.Addr) is skipped by modEndpoint.
func (n *Network) AddEndpoint(ctx context.Context, epIPv4, epIPv6 netip.Addr) error {
	const enable = true
	return n.modEndpoint(ctx, epIPv4, epIPv6, enable)
}
// DelEndpoint removes the per-endpoint firewall rules for an endpoint with
// addresses epIPv4 and epIPv6, by calling modEndpoint with enable set to false.
// An address that is not valid (the zero netip.Addr) is skipped by modEndpoint.
func (n *Network) DelEndpoint(ctx context.Context, epIPv4, epIPv6 netip.Addr) error {
	const enable = false
	return n.modEndpoint(ctx, epIPv4, epIPv6, enable)
}
// modEndpoint applies (enable is true) or withdraws (enable is false) the
// per-endpoint direct-access filtering for each address family.
//
// A family is handled only when it is switched on in n.ipt and the endpoint
// has a valid address for it. IPv4 is processed before IPv6, and the first
// error returned by filterDirectAccess stops processing and is returned as-is.
func (n *Network) modEndpoint(ctx context.Context, epIPv4, epIPv6 netip.Addr, enable bool) error {
	families := []struct {
		active bool
		ver    iptables.IPVersion
		cfg    NetworkConfigFam
		addr   netip.Addr
	}{
		{active: n.ipt.IPv4, ver: iptables.IPv4, cfg: n.Config4, addr: epIPv4},
		{active: n.ipt.IPv6, ver: iptables.IPv6, cfg: n.Config6, addr: epIPv6},
	}
	for _, fam := range families {
		if !fam.active || !fam.addr.IsValid() {
			continue
		}
		if err := n.filterDirectAccess(ctx, fam.ver, fam.cfg, fam.addr, enable); err != nil {
			return err
		}
	}
	return nil
}
// filterDirectAccess adds or removes the raw-PREROUTING rule that drops packets
// addressed directly to the container's IP address epIP, when direct routing is
// not permitted by network configuration.
//
// The rule only matches packets that did not arrive on the network's own bridge
// interface (n.IfName), so traffic from the bridge itself still reaches the
// container directly.
//
// It does nothing, and returns nil, if:
//   - the network is internal
//   - config.Unprotected or config.Routed is set for this address family.
//
// If rawRulesDisabled reports true, enable is overridden to false, so the rule
// is deleted rather than added.
func (n *Network) filterDirectAccess(ctx context.Context, ipv iptables.IPVersion, config NetworkConfigFam, epIP netip.Addr, enable bool) error {
	switch {
	case n.Internal, config.Unprotected, config.Routed:
		return nil
	}

	// Raw rules may have been switched off since an earlier daemon run created
	// this rule (for example, for a container left running over a restart).
	// Forcing a delete here cleans up such leftovers, and lets the changed
	// setting take effect across a live-restore restart.
	if rawRulesDisabled(ctx) {
		enable = false
	}

	dropArgs := []string{
		"-d", epIP.String(),
		"!", "-i", n.IfName,
		"-j", "DROP",
	}
	dropRule := iptables.Rule{
		IPVer: ipv,
		Table: iptables.Raw,
		Chain: "PREROUTING",
		Args:  dropArgs,
	}
	return appendOrDelChainRule(dropRule, "DIRECT ACCESS FILTERING - DROP", enable)
}

View File

@@ -51,7 +51,7 @@ func (n *Network) setPerPortIptables(ctx context.Context, b types.PortBinding, e
return err
}
if err := n.filterDirectAccess(ctx, b, enable); err != nil {
if err := n.dropLegacyFilterDirectAccess(ctx, b); err != nil {
return err
}
@@ -203,12 +203,28 @@ func filterPortMappedOnLoopback(ctx context.Context, b types.PortBinding, hostIP
return nil
}
// filterDirectAccess adds an iptables rule that drops 'direct' remote
// connections made to the container's IP address, when the network gateway
// mode is "nat".
// dropLegacyFilterDirectAccess deletes a rule that was introduced in 28.0.0 to
// drop 'direct' remote connections made to the container's IP address - for
// each published port on the container.
//
// This is a no-op if the gw_mode is "nat-unprotected" or "routed".
func (n *Network) filterDirectAccess(ctx context.Context, b types.PortBinding, enable bool) error {
// The normal filter-FORWARD rules would then drop packets sent directly to
// unpublished ports. This rule was only created along with the rest of port
// publishing (when a container's endpoint was selected as its gateway). Until
// then, all packets addressed directly to the container's ports were dropped
// by the filter-FORWARD rules.
//
// Since 28.0.2, direct routed packets sent to a container's address are all
// dropped in a raw-PREROUTING rule - it doesn't need to be per-port (so, fewer
// rules), and it can be created along with the endpoint (so directly-routed
// packets are dropped at the same point whether or not the endpoint is currently
// the gateway - so, very slightly earlier when it's not the gateway).
//
// This function was a no-op if the gw_mode was "nat-unprotected" or "routed".
// It still is, but in other modes it now always deletes the rule, in case it
// was created by an older version of the daemon.
//
// TODO(robmry) - remove this once there's no upgrade path from 28.0.x or 28.1.x.
func (n *Network) dropLegacyFilterDirectAccess(ctx context.Context, b types.PortBinding) error {
if rawRulesDisabled(ctx) {
return nil
}
@@ -232,7 +248,7 @@ func (n *Network) filterDirectAccess(ctx context.Context, b types.PortBinding, e
"!", "-i", n.IfName,
"-j", "DROP",
}}
if err := appendOrDelChainRule(drop, "DIRECT ACCESS FILTERING - DROP", enable); err != nil {
if err := appendOrDelChainRule(drop, "LEGACY DIRECT ACCESS FILTERING - DROP", false); err != nil {
return err
}