mirror of
https://github.com/moby/moby.git
synced 2026-01-11 18:51:37 +00:00
Create bridge veth in container netns
Since commit 933fcc9 (Re-remove the SetKey OCI prestart hook),
the network namespace will be set up before endpoints are
created in most cases, apart from build containers.
So, when possible, create the veth with one end in that netns
to save moving it in later. On my host, that saves about 20ms
for each bridge network a container is connected to.
Signed-off-by: Rob Murray <rob.murray@docker.com>
This commit is contained in:
@@ -912,6 +912,10 @@ func buildCreateEndpointOptions(c *container.Container, n *libnetwork.Network, e
|
||||
}
|
||||
}
|
||||
|
||||
if path, ok := sb.NetnsPath(); ok {
|
||||
createOptions = append(createOptions, libnetwork.WithNetnsPath(path))
|
||||
}
|
||||
|
||||
return createOptions, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -127,6 +127,14 @@ type InterfaceInfo interface {
|
||||
|
||||
// AddressIPv6 returns the IPv6 address.
|
||||
AddressIPv6() *net.IPNet
|
||||
|
||||
// NetnsPath returns the path of the network namespace, if there is one. Else "".
|
||||
NetnsPath() string
|
||||
|
||||
// SetCreatedInContainer can be called by the driver to indicate that it's
|
||||
// created the network interface in the container's network namespace (so,
|
||||
// it doesn't need to be moved there).
|
||||
SetCreatedInContainer(bool)
|
||||
}
|
||||
|
||||
// InterfaceNameInfo provides a go interface for the drivers to assign names
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"os"
|
||||
"strconv"
|
||||
"sync"
|
||||
"syscall"
|
||||
|
||||
"github.com/containerd/log"
|
||||
"github.com/docker/docker/errdefs"
|
||||
@@ -26,6 +27,7 @@ import (
|
||||
"github.com/docker/docker/libnetwork/types"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/vishvananda/netlink"
|
||||
"github.com/vishvananda/netns"
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
@@ -1157,12 +1159,12 @@ func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo dri
|
||||
}
|
||||
|
||||
// Generate and add the interface pipe host <-> sandbox
|
||||
veth := &netlink.Veth{
|
||||
LinkAttrs: netlink.LinkAttrs{Name: hostIfName, TxQLen: 0},
|
||||
PeerName: containerIfName,
|
||||
}
|
||||
if err = d.nlh.LinkAdd(veth); err != nil {
|
||||
return types.InternalErrorf("failed to add the host (%s) <=> sandbox (%s) pair interfaces: %v", hostIfName, containerIfName, err)
|
||||
nlhSb := d.nlh
|
||||
if nlh, err := createVeth(ctx, hostIfName, containerIfName, ifInfo, d.nlh); err != nil {
|
||||
return err
|
||||
} else if nlh != nil {
|
||||
defer nlh.Close()
|
||||
nlhSb = *nlh
|
||||
}
|
||||
|
||||
// Get the host side pipe interface handler
|
||||
@@ -1179,13 +1181,13 @@ func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo dri
|
||||
}()
|
||||
|
||||
// Get the sandbox side pipe interface handler
|
||||
sbox, err := d.nlh.LinkByName(containerIfName)
|
||||
sbox, err := nlhSb.LinkByName(containerIfName)
|
||||
if err != nil {
|
||||
return types.InternalErrorf("failed to find sandbox side interface %s: %v", containerIfName, err)
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
if err := d.nlh.LinkDel(sbox); err != nil {
|
||||
if err := nlhSb.LinkDel(sbox); err != nil {
|
||||
log.G(ctx).WithError(err).Warnf("Failed to delete sandbox side interface (%s)'s link", containerIfName)
|
||||
}
|
||||
}
|
||||
@@ -1201,7 +1203,7 @@ func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo dri
|
||||
if err != nil {
|
||||
return types.InternalErrorf("failed to set MTU on host interface %s: %v", hostIfName, err)
|
||||
}
|
||||
err = d.nlh.LinkSetMTU(sbox, config.Mtu)
|
||||
err = nlhSb.LinkSetMTU(sbox, config.Mtu)
|
||||
if err != nil {
|
||||
return types.InternalErrorf("failed to set MTU on sandbox interface %s: %v", containerIfName, err)
|
||||
}
|
||||
@@ -1248,6 +1250,58 @@ func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo dri
|
||||
return nil
|
||||
}
|
||||
|
||||
// createVeth creates a veth device with one end in the container's network namespace,
|
||||
// if it can get hold of the netns path and open the handles. In that case, it returns
|
||||
// a netlink handle in the container's namespace that must be closed by the caller.
|
||||
//
|
||||
// If the netns path isn't available, possibly because the netns hasn't been created
|
||||
// yet, or it's not possible to get a netns or netlink handle in the container's
|
||||
// namespace - both ends of the veth device are created in nlh's netns, and no netlink
|
||||
// handle is returned.
|
||||
//
|
||||
// (Only the error from creating the interface is returned. Failure to create the
|
||||
// interface in the container's netns is not an error.)
|
||||
func createVeth(ctx context.Context, hostIfName, containerIfName string, ifInfo driverapi.InterfaceInfo, nlh nlwrap.Handle) (nlhCtr *nlwrap.Handle, retErr error) {
|
||||
veth := &netlink.Veth{
|
||||
LinkAttrs: netlink.LinkAttrs{Name: hostIfName, TxQLen: 0},
|
||||
PeerName: containerIfName,
|
||||
}
|
||||
|
||||
if nspath := ifInfo.NetnsPath(); nspath == "" {
|
||||
log.G(ctx).WithField("ifname", containerIfName).Debug("No container netns path, creating interface in host netns")
|
||||
} else if netnsh, err := netns.GetFromPath(nspath); err != nil {
|
||||
log.G(ctx).WithFields(log.Fields{
|
||||
"error": err,
|
||||
"netns": nspath,
|
||||
"ifname": containerIfName,
|
||||
}).Warn("No container netns, creating interface in host netns")
|
||||
} else {
|
||||
defer netnsh.Close()
|
||||
|
||||
if nh, err := nlwrap.NewHandleAt(netnsh, syscall.NETLINK_ROUTE); err != nil {
|
||||
log.G(ctx).WithFields(log.Fields{
|
||||
"error": err,
|
||||
"netns": nspath,
|
||||
}).Warn("No netlink handle for container, creating interface in host netns")
|
||||
} else {
|
||||
defer func() {
|
||||
if retErr != nil {
|
||||
nh.Close()
|
||||
}
|
||||
}()
|
||||
|
||||
veth.PeerNamespace = netlink.NsFd(netnsh)
|
||||
nlhCtr = &nh
|
||||
ifInfo.SetCreatedInContainer(true)
|
||||
}
|
||||
}
|
||||
|
||||
if err := nlh.LinkAdd(veth); err != nil {
|
||||
return nil, types.InternalErrorf("failed to add the host (%s) <=> sandbox (%s) pair interfaces: %v", hostIfName, containerIfName, err)
|
||||
}
|
||||
return nlhCtr, nil
|
||||
}
|
||||
|
||||
func (d *driver) linkUp(ctx context.Context, host netlink.Link) error {
|
||||
ctx, span := otel.Tracer("").Start(ctx, "libnetwork.drivers.bridge.linkUp", trace.WithAttributes(
|
||||
attribute.String("host", host.Attrs().Name)))
|
||||
|
||||
@@ -26,9 +26,11 @@ import (
|
||||
"github.com/docker/docker/libnetwork/portallocator"
|
||||
"github.com/docker/docker/libnetwork/types"
|
||||
"github.com/vishvananda/netlink"
|
||||
"github.com/vishvananda/netns"
|
||||
"golang.org/x/sys/unix"
|
||||
"gotest.tools/v3/assert"
|
||||
is "gotest.tools/v3/assert/cmp"
|
||||
"gotest.tools/v3/icmd"
|
||||
)
|
||||
|
||||
func TestEndpointMarshalling(t *testing.T) {
|
||||
@@ -418,6 +420,84 @@ func TestCreateFullOptionsLabels(t *testing.T) {
|
||||
assert.Check(t, is.Equal(te2.iface.mac.String(), macAddr))
|
||||
}
|
||||
|
||||
func TestCreateVeth(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
netnsName string
|
||||
createNetns bool
|
||||
expCreatedInContainer bool
|
||||
}{
|
||||
{
|
||||
name: "host netns",
|
||||
},
|
||||
{
|
||||
name: "container netns",
|
||||
netnsName: "testnsctr",
|
||||
createNetns: true,
|
||||
expCreatedInContainer: true,
|
||||
},
|
||||
{
|
||||
name: "netns not created",
|
||||
netnsName: "testnsctr",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
// Create a "host" network namespace with a netlink handle.
|
||||
const hostNsName = "testnshost"
|
||||
res := icmd.RunCommand("ip", "netns", "add", hostNsName)
|
||||
assert.Assert(t, is.Equal(res.ExitCode, 0))
|
||||
defer icmd.RunCommand("ip", "netns", "del", hostNsName)
|
||||
nsh, err := netns.GetFromPath("/var/run/netns/" + hostNsName)
|
||||
assert.NilError(t, err)
|
||||
defer nsh.Close()
|
||||
nlh, err := nlwrap.NewHandleAt(nsh)
|
||||
assert.NilError(t, err)
|
||||
defer nlh.Close()
|
||||
|
||||
netnsPath := ""
|
||||
if tc.netnsName != "" {
|
||||
netnsPath = "/var/run/netns/" + tc.netnsName
|
||||
}
|
||||
if tc.createNetns {
|
||||
res := icmd.RunCommand("ip", "netns", "add", tc.netnsName)
|
||||
assert.Assert(t, is.Equal(res.ExitCode, 0))
|
||||
defer icmd.RunCommand("ip", "netns", "del", tc.netnsName)
|
||||
}
|
||||
|
||||
const hostIfName = "vethtesth"
|
||||
const containerIfName = "vethtestc"
|
||||
defer func() {
|
||||
// Just in case anything ends up in the host's netns, make sure it doesn't hang around ...
|
||||
icmd.RunCommand("ip", "link", "del", hostIfName)
|
||||
icmd.RunCommand("ip", "link", "del", containerIfName)
|
||||
}()
|
||||
|
||||
iface := &testInterface{netnsPath: netnsPath}
|
||||
nlhCtr, err := createVeth(context.Background(), hostIfName, containerIfName, iface, nlh)
|
||||
assert.Check(t, err)
|
||||
|
||||
assert.Check(t, is.Equal(iface.createdInContainer, tc.expCreatedInContainer))
|
||||
if tc.expCreatedInContainer {
|
||||
assert.Check(t, nlhCtr != nil)
|
||||
res := icmd.RunCommand("ip", "netns", "exec", hostNsName, "ip", "link", "show", hostIfName)
|
||||
assert.Check(t, is.Equal(res.ExitCode, 0))
|
||||
res = icmd.RunCommand("ip", "netns", "exec", hostNsName, "ip", "link", "show", containerIfName)
|
||||
assert.Check(t, is.Equal(res.ExitCode, 1))
|
||||
res = icmd.RunCommand("ip", "netns", "exec", tc.netnsName, "ip", "link", "show", containerIfName)
|
||||
assert.Check(t, is.Equal(res.ExitCode, 0))
|
||||
} else {
|
||||
assert.Check(t, nlhCtr == nil)
|
||||
res := icmd.RunCommand("ip", "netns", "exec", hostNsName, "ip", "link", "show", hostIfName)
|
||||
assert.Check(t, is.Equal(res.ExitCode, 0))
|
||||
res = icmd.RunCommand("ip", "netns", "exec", hostNsName, "ip", "link", "show", containerIfName)
|
||||
assert.Check(t, is.Equal(res.ExitCode, 0))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreate(t *testing.T) {
|
||||
defer netnsutils.SetupTestOSContext(t)()
|
||||
|
||||
@@ -558,11 +638,13 @@ func verifyV4INCEntries(networks map[string]*bridgeNetwork, t *testing.T) {
|
||||
}
|
||||
|
||||
// testInterface is a test double holding the interface state the bridge
// driver reads and writes while creating an endpoint.
type testInterface struct {
	mac     net.HardwareAddr
	addr    *net.IPNet
	addrv6  *net.IPNet
	srcName string
	dstName string
	// createdInContainer records the value passed to SetCreatedInContainer.
	createdInContainer bool
	// netnsPath is the value returned by NetnsPath.
	netnsPath string
}
|
||||
|
||||
type testEndpoint struct {
|
||||
@@ -637,6 +719,14 @@ func setAddress(ifaceAddr **net.IPNet, address *net.IPNet) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (i *testInterface) NetnsPath() string {
|
||||
return i.netnsPath
|
||||
}
|
||||
|
||||
func (i *testInterface) SetCreatedInContainer(cic bool) {
|
||||
i.createdInContainer = cic
|
||||
}
|
||||
|
||||
func (i *testInterface) SetNames(srcName string, dstName string) error {
|
||||
i.srcName = srcName
|
||||
i.dstName = dstName
|
||||
|
||||
@@ -183,6 +183,10 @@ func (test *testEndpoint) SetGatewayIPv6(ipv6 net.IP) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// NetnsPath always returns the empty string for this test endpoint.
func (test *testEndpoint) NetnsPath() string {
	return ""
}
|
||||
|
||||
// SetCreatedInContainer is a no-op; this test endpoint ignores the value.
func (test *testEndpoint) SetCreatedInContainer(bool) {
}
|
||||
|
||||
func (test *testEndpoint) SetNames(src string, dst string) error {
|
||||
if test.src != src {
|
||||
test.t.Fatalf(`Wrong SrcName; expected "%s", got "%s"`, test.src, src)
|
||||
@@ -571,6 +575,10 @@ func (r *rollbackEndpoint) SetIPAddress(ip *net.IPNet) error {
|
||||
return errors.New("invalid ip")
|
||||
}
|
||||
|
||||
// NetnsPath always returns the empty string for the rollback test endpoint.
func (r *rollbackEndpoint) NetnsPath() string {
	return ""
}
|
||||
|
||||
// SetCreatedInContainer is a no-op; the rollback test endpoint ignores the
// value.
func (r *rollbackEndpoint) SetCreatedInContainer(bool) {
}
|
||||
|
||||
func TestRollback(t *testing.T) {
|
||||
plugin := "test-net-driver-rollback"
|
||||
|
||||
|
||||
@@ -145,3 +145,10 @@ func (test *testEndpoint) AddStaticRoute(destination *net.IPNet, routeType int,
|
||||
func (test *testEndpoint) DisableGatewayService() {
|
||||
test.disableGatewayService = true
|
||||
}
|
||||
|
||||
func (test *testEndpoint) NetnsPath() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func (test *testEndpoint) SetCreatedInContainer(bool) {
|
||||
}
|
||||
|
||||
@@ -1254,6 +1254,12 @@ func JoinOptionPriority(prio int) EndpointOption {
|
||||
}
|
||||
}
|
||||
|
||||
func WithNetnsPath(path string) EndpointOption {
|
||||
return func(ep *Endpoint) {
|
||||
ep.iface.netnsPath = path
|
||||
}
|
||||
}
|
||||
|
||||
func (ep *Endpoint) assignAddress(ipam ipamapi.Ipam, assignIPv4, assignIPv6 bool) error {
|
||||
n := ep.getNetwork()
|
||||
if n.hasSpecialDriver() {
|
||||
|
||||
@@ -37,15 +37,17 @@ type EndpointInfo interface {
|
||||
|
||||
// EndpointInterface holds interface addresses bound to the endpoint.
type EndpointInterface struct {
	mac       net.HardwareAddr
	addr      *net.IPNet
	addrv6    *net.IPNet
	llAddrs   []*net.IPNet
	srcName   string
	dstPrefix string
	routes    []*net.IPNet
	v4PoolID  string
	v6PoolID  string
	// netnsPath is the path of the network namespace, or "" if there isn't one.
	netnsPath string
	// createdInContainer is set by the driver when it created the interface
	// in the container's network namespace.
	createdInContainer bool
}
|
||||
|
||||
func (epi *EndpointInterface) MarshalJSON() ([]byte, error) {
|
||||
@@ -75,6 +77,7 @@ func (epi *EndpointInterface) MarshalJSON() ([]byte, error) {
|
||||
epMap["routes"] = routes
|
||||
epMap["v4PoolID"] = epi.v4PoolID
|
||||
epMap["v6PoolID"] = epi.v6PoolID
|
||||
epMap["createdInContainer"] = epi.createdInContainer
|
||||
return json.Marshal(epMap)
|
||||
}
|
||||
|
||||
@@ -132,6 +135,9 @@ func (epi *EndpointInterface) UnmarshalJSON(b []byte) error {
|
||||
epi.v4PoolID = epMap["v4PoolID"].(string)
|
||||
epi.v6PoolID = epMap["v6PoolID"].(string)
|
||||
|
||||
if v, ok := epMap["createdInContainer"]; ok {
|
||||
epi.createdInContainer = v.(bool)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -143,6 +149,7 @@ func (epi *EndpointInterface) CopyTo(dstEpi *EndpointInterface) error {
|
||||
dstEpi.dstPrefix = epi.dstPrefix
|
||||
dstEpi.v4PoolID = epi.v4PoolID
|
||||
dstEpi.v6PoolID = epi.v6PoolID
|
||||
dstEpi.createdInContainer = epi.createdInContainer
|
||||
if len(epi.llAddrs) != 0 {
|
||||
dstEpi.llAddrs = make([]*net.IPNet, 0, len(epi.llAddrs))
|
||||
dstEpi.llAddrs = append(dstEpi.llAddrs, epi.llAddrs...)
|
||||
@@ -269,6 +276,18 @@ func (epi *EndpointInterface) SetNames(srcName string, dstPrefix string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// NetnsPath returns the path of the network namespace, if there is one. Else "".
|
||||
func (epi *EndpointInterface) NetnsPath() string {
|
||||
return epi.netnsPath
|
||||
}
|
||||
|
||||
// SetCreatedInContainer can be called by the driver to indicate that it's
|
||||
// created the network interface in the container's network namespace (so,
|
||||
// it doesn't need to be moved there).
|
||||
func (epi *EndpointInterface) SetCreatedInContainer(cic bool) {
|
||||
epi.createdInContainer = cic
|
||||
}
|
||||
|
||||
func (ep *Endpoint) InterfaceName() driverapi.InterfaceNameInfo {
|
||||
ep.mu.Lock()
|
||||
defer ep.mu.Unlock()
|
||||
|
||||
@@ -106,6 +106,7 @@ type Interface struct {
|
||||
// advertiseAddrInterval is the interval between unsolicited ARP/NA messages sent to
|
||||
// advertise the interface's addresses.
|
||||
advertiseAddrInterval time.Duration
|
||||
createdInContainer bool
|
||||
ns *Namespace
|
||||
}
|
||||
|
||||
@@ -265,7 +266,7 @@ func (n *Namespace) AddInterface(ctx context.Context, srcName, dstPrefix string,
|
||||
}); err != nil {
|
||||
return fmt.Errorf("failed to create bridge %q: %v", i.srcName, err)
|
||||
}
|
||||
} else {
|
||||
} else if !i.createdInContainer {
|
||||
// Find the network interface identified by the SrcName attribute.
|
||||
iface, err := nlhHost.LinkByName(i.srcName)
|
||||
if err != nil {
|
||||
|
||||
@@ -119,3 +119,13 @@ func WithAdvertiseAddrInterval(interval time.Duration) IfaceOption {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// WithCreatedInContainer can be used to say the network driver created the
|
||||
// interface in the container's network namespace (and, therefore, it doesn't
|
||||
// need to be moved into that namespace.)
|
||||
func WithCreatedInContainer(cic bool) IfaceOption {
|
||||
return func(i *Interface) error {
|
||||
i.createdInContainer = cic
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
@@ -207,6 +207,18 @@ func (sb *Sandbox) SetKey(ctx context.Context, basePath string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// NetnsPath returns the network namespace's path and true, if a network has been
|
||||
// created - else the empty string and false.
|
||||
func (sb *Sandbox) NetnsPath() (path string, ok bool) {
|
||||
sb.mu.Lock()
|
||||
osSbox := sb.osSbox
|
||||
sb.mu.Unlock()
|
||||
if osSbox == nil {
|
||||
return "", false
|
||||
}
|
||||
return osSbox.Key(), true
|
||||
}
|
||||
|
||||
// IPv6Enabled determines whether a container supports IPv6.
|
||||
// IPv6 support can always be determined for host networking. For other network
|
||||
// types it can only be determined once there's a container namespace to probe,
|
||||
@@ -348,6 +360,7 @@ func (sb *Sandbox) populateNetworkResources(ctx context.Context, ep *Endpoint) e
|
||||
ifaceOptions = append(ifaceOptions, osl.WithAdvertiseAddrInterval(interval))
|
||||
}
|
||||
}
|
||||
ifaceOptions = append(ifaceOptions, osl.WithCreatedInContainer(i.createdInContainer))
|
||||
|
||||
if err := sb.osSbox.AddInterface(ctx, i.srcName, i.dstPrefix, ifaceOptions...); err != nil {
|
||||
return fmt.Errorf("failed to add interface %s to sandbox: %v", i.srcName, err)
|
||||
|
||||
@@ -28,6 +28,11 @@ func (sb *Sandbox) restoreOslSandbox() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// NetnsPath is not implemented on Windows (Sandbox.osSbox is always nil)
|
||||
func (sb *Sandbox) NetnsPath() (path string, ok bool) {
|
||||
return "", false
|
||||
}
|
||||
|
||||
func (sb *Sandbox) populateNetworkResources(context.Context, *Endpoint) error {
|
||||
// not implemented on Windows (Sandbox.osSbox is always nil)
|
||||
return nil
|
||||
|
||||
Reference in New Issue
Block a user