Create bridge veth in container netns

Since commit 933fcc9 (Re-remove the SetKey OCI prestart hook),
the network namespace will be set up before endpoints are
created in most cases, apart from build containers.

So, when possible, create the veth with one end in that netns
to save moving it in later. On my host, that saves about 20ms
for each bridge network a container is connected to.

Signed-off-by: Rob Murray <rob.murray@docker.com>
This commit is contained in:
Rob Murray
2025-01-19 14:36:01 +00:00
parent b3b9e990ee
commit 65120d586b
12 changed files with 249 additions and 24 deletions

View File

@@ -912,6 +912,10 @@ func buildCreateEndpointOptions(c *container.Container, n *libnetwork.Network, e
}
}
if path, ok := sb.NetnsPath(); ok {
createOptions = append(createOptions, libnetwork.WithNetnsPath(path))
}
return createOptions, nil
}

View File

@@ -127,6 +127,14 @@ type InterfaceInfo interface {
// AddressIPv6 returns the IPv6 address.
AddressIPv6() *net.IPNet
// NetnsPath returns the path of the network namespace, if there is one. Else "".
NetnsPath() string
// SetCreatedInContainer can be called by the driver to indicate that it's
// created the network interface in the container's network namespace (so,
// it doesn't need to be moved there).
SetCreatedInContainer(bool)
}
// InterfaceNameInfo provides a go interface for the drivers to assign names

View File

@@ -8,6 +8,7 @@ import (
"os"
"strconv"
"sync"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/errdefs"
@@ -26,6 +27,7 @@ import (
"github.com/docker/docker/libnetwork/types"
"github.com/pkg/errors"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netns"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
@@ -1157,12 +1159,12 @@ func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo dri
}
// Generate and add the interface pipe host <-> sandbox
veth := &netlink.Veth{
LinkAttrs: netlink.LinkAttrs{Name: hostIfName, TxQLen: 0},
PeerName: containerIfName,
}
if err = d.nlh.LinkAdd(veth); err != nil {
return types.InternalErrorf("failed to add the host (%s) <=> sandbox (%s) pair interfaces: %v", hostIfName, containerIfName, err)
nlhSb := d.nlh
if nlh, err := createVeth(ctx, hostIfName, containerIfName, ifInfo, d.nlh); err != nil {
return err
} else if nlh != nil {
defer nlh.Close()
nlhSb = *nlh
}
// Get the host side pipe interface handler
@@ -1179,13 +1181,13 @@ func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo dri
}()
// Get the sandbox side pipe interface handler
sbox, err := d.nlh.LinkByName(containerIfName)
sbox, err := nlhSb.LinkByName(containerIfName)
if err != nil {
return types.InternalErrorf("failed to find sandbox side interface %s: %v", containerIfName, err)
}
defer func() {
if err != nil {
if err := d.nlh.LinkDel(sbox); err != nil {
if err := nlhSb.LinkDel(sbox); err != nil {
log.G(ctx).WithError(err).Warnf("Failed to delete sandbox side interface (%s)'s link", containerIfName)
}
}
@@ -1201,7 +1203,7 @@ func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo dri
if err != nil {
return types.InternalErrorf("failed to set MTU on host interface %s: %v", hostIfName, err)
}
err = d.nlh.LinkSetMTU(sbox, config.Mtu)
err = nlhSb.LinkSetMTU(sbox, config.Mtu)
if err != nil {
return types.InternalErrorf("failed to set MTU on sandbox interface %s: %v", containerIfName, err)
}
@@ -1248,6 +1250,58 @@ func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo dri
return nil
}
// createVeth creates a veth pair using nlh, with hostIfName as the name of the
// end created in nlh's network namespace and containerIfName as the peer name.
//
// If ifInfo reports a netns path, and both a netns handle and a netlink handle
// can be opened for it, the peer end is created directly in that network
// namespace. In that case ifInfo.SetCreatedInContainer(true) is called once the
// veth has been created, and a netlink handle in the container's namespace is
// returned. That handle must be closed by the caller.
//
// If the netns path isn't available, possibly because the netns hasn't been
// created yet, or it's not possible to get a netns or netlink handle in the
// container's namespace, both ends of the veth device are created in nlh's
// netns, and no netlink handle is returned.
//
// Only the error from creating the interface is returned. Failure to set up for
// creating the peer in the container's netns is logged, and is not an error. On
// error, no handle is returned and ifInfo is not modified.
func createVeth(ctx context.Context, hostIfName, containerIfName string, ifInfo driverapi.InterfaceInfo, nlh nlwrap.Handle) (nlhCtr *nlwrap.Handle, retErr error) {
	veth := &netlink.Veth{
		LinkAttrs: netlink.LinkAttrs{Name: hostIfName, TxQLen: 0},
		PeerName:  containerIfName,
	}

	if nspath := ifInfo.NetnsPath(); nspath == "" {
		log.G(ctx).WithField("ifname", containerIfName).Debug("No container netns path, creating interface in host netns")
	} else if netnsh, err := netns.GetFromPath(nspath); err != nil {
		log.G(ctx).WithFields(log.Fields{
			"error":  err,
			"netns":  nspath,
			"ifname": containerIfName,
		}).Warn("No container netns, creating interface in host netns")
	} else {
		// The netns handle is referenced by veth.PeerNamespace below, so it
		// must stay open until LinkAdd has run; the defer fires on return.
		defer netnsh.Close()

		if nh, err := nlwrap.NewHandleAt(netnsh, syscall.NETLINK_ROUTE); err != nil {
			log.G(ctx).WithFields(log.Fields{
				"error": err,
				"netns": nspath,
			}).Warn("No netlink handle for container, creating interface in host netns")
		} else {
			// The caller only takes ownership of the handle on success.
			defer func() {
				if retErr != nil {
					nh.Close()
				}
			}()

			veth.PeerNamespace = netlink.NsFd(netnsh)
			nlhCtr = &nh
		}
	}

	if err := nlh.LinkAdd(veth); err != nil {
		return nil, types.InternalErrorf("failed to add the host (%s) <=> sandbox (%s) pair interfaces: %v", hostIfName, containerIfName, err)
	}

	// Only report the interface as created in the container once it exists, so
	// that a failed LinkAdd doesn't leave a stale flag on ifInfo.
	if nlhCtr != nil {
		ifInfo.SetCreatedInContainer(true)
	}
	return nlhCtr, nil
}
func (d *driver) linkUp(ctx context.Context, host netlink.Link) error {
ctx, span := otel.Tracer("").Start(ctx, "libnetwork.drivers.bridge.linkUp", trace.WithAttributes(
attribute.String("host", host.Attrs().Name)))

View File

@@ -26,9 +26,11 @@ import (
"github.com/docker/docker/libnetwork/portallocator"
"github.com/docker/docker/libnetwork/types"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netns"
"golang.org/x/sys/unix"
"gotest.tools/v3/assert"
is "gotest.tools/v3/assert/cmp"
"gotest.tools/v3/icmd"
)
func TestEndpointMarshalling(t *testing.T) {
@@ -418,6 +420,84 @@ func TestCreateFullOptionsLabels(t *testing.T) {
assert.Check(t, is.Equal(te2.iface.mac.String(), macAddr))
}
// TestCreateVeth checks that createVeth puts the container end of the veth in
// the container's network namespace when that namespace exists, and otherwise
// creates both ends in the namespace of the netlink handle it's given.
//
// Each case uses a dedicated "host" namespace (created with "ip netns add")
// rather than the real host namespace.
func TestCreateVeth(t *testing.T) {
	tests := []struct {
		name                  string
		netnsName             string
		createNetns           bool
		expCreatedInContainer bool
	}{
		{
			name: "host netns",
		},
		{
			name:                  "container netns",
			netnsName:             "testnsctr",
			createNetns:           true,
			expCreatedInContainer: true,
		},
		{
			name:      "netns not created",
			netnsName: "testnsctr",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			// Create a "host" network namespace with a netlink handle.
			const hostNsName = "testnshost"
			res := icmd.RunCommand("ip", "netns", "add", hostNsName)
			assert.Assert(t, is.Equal(res.ExitCode, 0))
			defer icmd.RunCommand("ip", "netns", "del", hostNsName)
			nsh, err := netns.GetFromPath("/var/run/netns/" + hostNsName)
			assert.NilError(t, err)
			defer nsh.Close()
			nlh, err := nlwrap.NewHandleAt(nsh)
			assert.NilError(t, err)
			defer nlh.Close()

			// A path is passed to createVeth whenever the case names a
			// netns, whether or not that netns is actually created.
			netnsPath := ""
			if tc.netnsName != "" {
				netnsPath = "/var/run/netns/" + tc.netnsName
			}
			if tc.createNetns {
				res := icmd.RunCommand("ip", "netns", "add", tc.netnsName)
				assert.Assert(t, is.Equal(res.ExitCode, 0))
				defer icmd.RunCommand("ip", "netns", "del", tc.netnsName)
			}

			const hostIfName = "vethtesth"
			const containerIfName = "vethtestc"
			defer func() {
				// Just in case anything ends up in the host's netns, make sure it doesn't hang around ...
				icmd.RunCommand("ip", "link", "del", hostIfName)
				icmd.RunCommand("ip", "link", "del", containerIfName)
			}()

			iface := &testInterface{netnsPath: netnsPath}
			nlhCtr, err := createVeth(context.Background(), hostIfName, containerIfName, iface, nlh)
			assert.Check(t, err)
			if nlhCtr != nil {
				// createVeth's caller owns the returned handle. Deferred last,
				// so it's closed before the namespaces are deleted.
				defer nlhCtr.Close()
			}
			assert.Check(t, is.Equal(iface.createdInContainer, tc.expCreatedInContainer))

			if tc.expCreatedInContainer {
				assert.Check(t, nlhCtr != nil)
				// The host end is in the "host" netns, the container end is
				// only in the container's netns.
				res := icmd.RunCommand("ip", "netns", "exec", hostNsName, "ip", "link", "show", hostIfName)
				assert.Check(t, is.Equal(res.ExitCode, 0))
				res = icmd.RunCommand("ip", "netns", "exec", hostNsName, "ip", "link", "show", containerIfName)
				assert.Check(t, is.Equal(res.ExitCode, 1))
				res = icmd.RunCommand("ip", "netns", "exec", tc.netnsName, "ip", "link", "show", containerIfName)
				assert.Check(t, is.Equal(res.ExitCode, 0))
			} else {
				assert.Check(t, nlhCtr == nil)
				// Both ends are in the "host" netns.
				res := icmd.RunCommand("ip", "netns", "exec", hostNsName, "ip", "link", "show", hostIfName)
				assert.Check(t, is.Equal(res.ExitCode, 0))
				res = icmd.RunCommand("ip", "netns", "exec", hostNsName, "ip", "link", "show", containerIfName)
				assert.Check(t, is.Equal(res.ExitCode, 0))
			}
		})
	}
}
func TestCreate(t *testing.T) {
defer netnsutils.SetupTestOSContext(t)()
@@ -558,11 +638,13 @@ func verifyV4INCEntries(networks map[string]*bridgeNetwork, t *testing.T) {
}
type testInterface struct {
mac net.HardwareAddr
addr *net.IPNet
addrv6 *net.IPNet
srcName string
dstName string
mac net.HardwareAddr
addr *net.IPNet
addrv6 *net.IPNet
srcName string
dstName string
createdInContainer bool
netnsPath string
}
type testEndpoint struct {
@@ -637,6 +719,14 @@ func setAddress(ifaceAddr **net.IPNet, address *net.IPNet) error {
return nil
}
// NetnsPath returns the netns path the test configured on this interface.
func (i *testInterface) NetnsPath() string { return i.netnsPath }
// SetCreatedInContainer records the value reported by the driver, so the test
// can inspect it afterwards.
func (i *testInterface) SetCreatedInContainer(cic bool) { i.createdInContainer = cic }
func (i *testInterface) SetNames(srcName string, dstName string) error {
i.srcName = srcName
i.dstName = dstName

View File

@@ -183,6 +183,10 @@ func (test *testEndpoint) SetGatewayIPv6(ipv6 net.IP) error {
return nil
}
// NetnsPath always reports that no network namespace path is available.
func (test *testEndpoint) NetnsPath() string {
	return ""
}
// SetCreatedInContainer is a no-op; the argument is ignored.
func (test *testEndpoint) SetCreatedInContainer(bool) {
}
func (test *testEndpoint) SetNames(src string, dst string) error {
if test.src != src {
test.t.Fatalf(`Wrong SrcName; expected "%s", got "%s"`, test.src, src)
@@ -571,6 +575,10 @@ func (r *rollbackEndpoint) SetIPAddress(ip *net.IPNet) error {
return errors.New("invalid ip")
}
// NetnsPath always reports that no network namespace path is available.
func (r *rollbackEndpoint) NetnsPath() string {
	return ""
}
// SetCreatedInContainer is a no-op; the argument is ignored.
func (r *rollbackEndpoint) SetCreatedInContainer(bool) {
}
func TestRollback(t *testing.T) {
plugin := "test-net-driver-rollback"

View File

@@ -145,3 +145,10 @@ func (test *testEndpoint) AddStaticRoute(destination *net.IPNet, routeType int,
// DisableGatewayService records that the gateway service was disabled by
// setting the disableGatewayService flag.
func (test *testEndpoint) DisableGatewayService() { test.disableGatewayService = true }
// NetnsPath always reports that no network namespace path is available.
func (test *testEndpoint) NetnsPath() string { return "" }
// SetCreatedInContainer is a no-op; the argument is ignored.
func (test *testEndpoint) SetCreatedInContainer(bool) {}

View File

@@ -1254,6 +1254,12 @@ func JoinOptionPriority(prio int) EndpointOption {
}
}
// WithNetnsPath returns an EndpointOption that stores path as the network
// namespace path on the endpoint's interface.
func WithNetnsPath(path string) EndpointOption {
	return func(e *Endpoint) {
		// NOTE(review): assumes e.iface has been initialised before options
		// are applied - confirm against the caller.
		e.iface.netnsPath = path
	}
}
func (ep *Endpoint) assignAddress(ipam ipamapi.Ipam, assignIPv4, assignIPv6 bool) error {
n := ep.getNetwork()
if n.hasSpecialDriver() {

View File

@@ -37,15 +37,17 @@ type EndpointInfo interface {
// EndpointInterface holds interface addresses bound to the endpoint.
type EndpointInterface struct {
mac net.HardwareAddr
addr *net.IPNet
addrv6 *net.IPNet
llAddrs []*net.IPNet
srcName string
dstPrefix string
routes []*net.IPNet
v4PoolID string
v6PoolID string
mac net.HardwareAddr
addr *net.IPNet
addrv6 *net.IPNet
llAddrs []*net.IPNet
srcName string
dstPrefix string
routes []*net.IPNet
v4PoolID string
v6PoolID string
netnsPath string
createdInContainer bool
}
func (epi *EndpointInterface) MarshalJSON() ([]byte, error) {
@@ -75,6 +77,7 @@ func (epi *EndpointInterface) MarshalJSON() ([]byte, error) {
epMap["routes"] = routes
epMap["v4PoolID"] = epi.v4PoolID
epMap["v6PoolID"] = epi.v6PoolID
epMap["createdInContainer"] = epi.createdInContainer
return json.Marshal(epMap)
}
@@ -132,6 +135,9 @@ func (epi *EndpointInterface) UnmarshalJSON(b []byte) error {
epi.v4PoolID = epMap["v4PoolID"].(string)
epi.v6PoolID = epMap["v6PoolID"].(string)
if v, ok := epMap["createdInContainer"]; ok {
epi.createdInContainer = v.(bool)
}
return nil
}
@@ -143,6 +149,7 @@ func (epi *EndpointInterface) CopyTo(dstEpi *EndpointInterface) error {
dstEpi.dstPrefix = epi.dstPrefix
dstEpi.v4PoolID = epi.v4PoolID
dstEpi.v6PoolID = epi.v6PoolID
dstEpi.createdInContainer = epi.createdInContainer
if len(epi.llAddrs) != 0 {
dstEpi.llAddrs = make([]*net.IPNet, 0, len(epi.llAddrs))
dstEpi.llAddrs = append(dstEpi.llAddrs, epi.llAddrs...)
@@ -269,6 +276,18 @@ func (epi *EndpointInterface) SetNames(srcName string, dstPrefix string) error {
return nil
}
// NetnsPath returns the network namespace path stored on the interface (for
// example, by the WithNetnsPath endpoint option), or "" if none has been set.
func (epi *EndpointInterface) NetnsPath() string {
	return epi.netnsPath
}
// SetCreatedInContainer can be called by the driver to indicate that it has
// created the network interface in the container's network namespace (so it
// doesn't need to be moved there). The value is stored on the interface.
func (epi *EndpointInterface) SetCreatedInContainer(cic bool) {
	epi.createdInContainer = cic
}
func (ep *Endpoint) InterfaceName() driverapi.InterfaceNameInfo {
ep.mu.Lock()
defer ep.mu.Unlock()

View File

@@ -106,6 +106,7 @@ type Interface struct {
// advertiseAddrInterval is the interval between unsolicited ARP/NA messages sent to
// advertise the interface's addresses.
advertiseAddrInterval time.Duration
createdInContainer bool
ns *Namespace
}
@@ -265,7 +266,7 @@ func (n *Namespace) AddInterface(ctx context.Context, srcName, dstPrefix string,
}); err != nil {
return fmt.Errorf("failed to create bridge %q: %v", i.srcName, err)
}
} else {
} else if !i.createdInContainer {
// Find the network interface identified by the SrcName attribute.
iface, err := nlhHost.LinkByName(i.srcName)
if err != nil {

View File

@@ -119,3 +119,13 @@ func WithAdvertiseAddrInterval(interval time.Duration) IfaceOption {
return nil
}
}
// WithCreatedInContainer returns an IfaceOption that records whether the
// network driver created the interface directly in the container's network
// namespace (in which case it doesn't need to be moved into that namespace).
// The returned option never fails.
func WithCreatedInContainer(cic bool) IfaceOption {
	return func(iface *Interface) error {
		iface.createdInContainer = cic
		return nil
	}
}

View File

@@ -207,6 +207,18 @@ func (sb *Sandbox) SetKey(ctx context.Context, basePath string) error {
return nil
}
// NetnsPath returns the network namespace's path (the key of the sandbox's OS
// sandbox) and true, if the sandbox has an OS sandbox - else the empty string
// and false.
func (sb *Sandbox) NetnsPath() (path string, ok bool) {
	// Only the read of sb.osSbox is done under the lock.
	sb.mu.Lock()
	ns := sb.osSbox
	sb.mu.Unlock()

	if ns != nil {
		return ns.Key(), true
	}
	return "", false
}
// IPv6Enabled determines whether a container supports IPv6.
// IPv6 support can always be determined for host networking. For other network
// types it can only be determined once there's a container namespace to probe,
@@ -348,6 +360,7 @@ func (sb *Sandbox) populateNetworkResources(ctx context.Context, ep *Endpoint) e
ifaceOptions = append(ifaceOptions, osl.WithAdvertiseAddrInterval(interval))
}
}
ifaceOptions = append(ifaceOptions, osl.WithCreatedInContainer(i.createdInContainer))
if err := sb.osSbox.AddInterface(ctx, i.srcName, i.dstPrefix, ifaceOptions...); err != nil {
return fmt.Errorf("failed to add interface %s to sandbox: %v", i.srcName, err)

View File

@@ -28,6 +28,11 @@ func (sb *Sandbox) restoreOslSandbox() error {
return nil
}
// NetnsPath is not implemented on Windows (Sandbox.osSbox is always nil), so
// it always returns the empty string and false.
func (sb *Sandbox) NetnsPath() (path string, ok bool) {
	return "", false
}
func (sb *Sandbox) populateNetworkResources(context.Context, *Endpoint) error {
// not implemented on Windows (Sandbox.osSbox is always nil)
return nil