From 59437f56f9f8d1aa9b08596898c6df4778f72e2a Mon Sep 17 00:00:00 2001 From: Cory Snider Date: Tue, 20 May 2025 15:09:54 -0400 Subject: [PATCH 1/3] libnetwork/d/overlay: refactor peer db impl The peer db implementation is more complex than it needs to be. Notably, the peerCRUD / peerCRUDOp function split is a vestige of its evolution from a worker goroutine receiving commands over a channel. Refactor the peer db operations to be easier to read, understand and modify. Factor the kernel-programming operations out into dedicated addNeighbor and deleteNeighbor functions. Inline the rest of the peerCRUDOp functions into their respective peerCRUD wrappers. Signed-off-by: Cory Snider --- libnetwork/drivers/overlay/peerdb.go | 203 ++++++++++++++------------- 1 file changed, 104 insertions(+), 99 deletions(-) diff --git a/libnetwork/drivers/overlay/peerdb.go b/libnetwork/drivers/overlay/peerdb.go index 39dae6617b..5c103ad8e5 100644 --- a/libnetwork/drivers/overlay/peerdb.go +++ b/libnetwork/drivers/overlay/peerdb.go @@ -5,6 +5,7 @@ package overlay import ( "context" + "errors" "fmt" "net" "net/netip" @@ -156,63 +157,57 @@ func (d *driver) peerDbDelete(nid, eid string, peerIP netip.Prefix, peerMac net. // networkDB has already delivered some events of peers already available on remote nodes, // these peers are saved into the peerDB and this function is used to properly configure // the network sandbox with all those peers that got previously notified. -// Note also that this method sends a single message on the channel and the go routine on the -// other side, will atomically loop on the whole table of peers and will program their state -// in one single atomic operation. This is fundamental to guarantee consistency, and avoid that +// Note also that this method atomically loops on the whole table of peers +// and programs their state in one single atomic operation. +// This is fundamental to guarantee consistency, and avoid that // new peerAdd or peerDelete gets reordered during the sandbox init. func (d *driver) initSandboxPeerDB(nid string) { d.peerOpMu.Lock() defer d.peerOpMu.Unlock() - if err := d.peerInitOp(nid); err != nil { + err := d.peerDbNetworkWalk(nid, func(peerIP netip.Addr, peerMac net.HardwareAddr, pEntry *peerEntry) bool { + if !pEntry.isLocal() { + d.addNeighbor(nid, netip.PrefixFrom(peerIP, pEntry.prefixBits), peerMac, pEntry.vtep) + } + return false // walk all entries + }) + + if err != nil { log.G(context.TODO()).WithError(err).Warn("Peer init operation failed") } } -func (d *driver) peerInitOp(nid string) error { - return d.peerDbNetworkWalk(nid, func(peerIP netip.Addr, peerMac net.HardwareAddr, pEntry *peerEntry) bool { - // Local entries do not need to be added - if pEntry.isLocal() { - return false - } - - d.peerAddOp(nid, pEntry.eid, netip.PrefixFrom(peerIP, pEntry.prefixBits), peerMac, pEntry.vtep, false) - // return false to loop on all entries - return false - }) -} - // peerAdd adds a new entry to the peer database. // // Local peers are signified by an invalid vtep (i.e. netip.Addr{}). func (d *driver) peerAdd(nid, eid string, peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) { + if err := validateID(nid, eid); err != nil { + log.G(context.TODO()).WithError(err).Warn("Peer add operation failed") + return + } + d.peerOpMu.Lock() defer d.peerOpMu.Unlock() - err := d.peerAddOp(nid, eid, peerIP, peerMac, vtep, true) - if err != nil { - log.G(context.TODO()).WithError(err).Warn("Peer add operation failed") + + inserted, dbEntries := d.peerDbAdd(nid, eid, peerIP, peerMac, vtep) + if !inserted { + log.G(context.TODO()).Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v vtep:%v", + nid, eid, peerIP, peerMac, vtep) + } + if vtep.IsValid() { + err := d.addNeighbor(nid, peerIP, peerMac, vtep) + if err != nil { + if dbEntries > 1 && errors.As(err, &osl.NeighborSearchError{}) { + // We are in the transient case so only the first configuration is programmed into the kernel. + // Upon deletion if the active configuration is deleted the next one from the database will be restored. + return + } + log.G(context.TODO()).WithFields(log.Fields{"nid": nid, "eid": eid}).WithError(err).Warn("Peer add operation failed") + } } } -func (d *driver) peerAddOp(nid, eid string, peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr, updateDB bool) error { - if err := validateID(nid, eid); err != nil { - return err - } - - var dbEntries int - var inserted bool - if updateDB { - inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerMac, vtep) - if !inserted { - log.G(context.TODO()).Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v vtep:%v", - nid, eid, peerIP, peerMac, vtep) - } - } - - // Local peers do not need any further configuration - if !vtep.IsValid() { - return nil - } - +// addNeighbor programs the kernel so the given peer is reachable through the VXLAN tunnel. +func (d *driver) addNeighbor(nid string, peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) error { n := d.network(nid) if n == nil { return nil @@ -243,18 +238,12 @@ func (d *driver) peerAddOp(nid, eid string, peerIP netip.Prefix, peerMac net.Har // Add neighbor entry for the peer IP if err := sbox.AddNeighbor(peerIP.Addr().AsSlice(), peerMac, osl.WithLinkName(s.vxlanName)); err != nil { - if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 { - // We are in the transient case so only the first configuration is programmed into the kernel - // Upon deletion if the active configuration is deleted the next one from the database will be restored - // Note we are skipping also the next configuration - return nil - } - return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) + return fmt.Errorf("could not add neighbor entry into the sandbox: %w", err) } // Add fdb entry to the bridge for the peer mac if err := sbox.AddNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil { - return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) + return fmt.Errorf("could not add fdb entry into the sandbox: %w", err) } return nil @@ -264,25 +253,67 @@ func (d *driver) peerAddOp(nid, eid string, peerIP netip.Prefix, peerMac net.Har // // Local peers are signified by an invalid vtep (i.e. netip.Addr{}). func (d *driver) peerDelete(nid, eid string, peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) { + logger := log.G(context.TODO()).WithFields(log.Fields{ + "nid": nid, + "eid": eid, + "ip": peerIP, + "mac": peerMac, + "vtep": vtep, + }) + if err := validateID(nid, eid); err != nil { + logger.WithError(err).Warn("Peer delete operation failed") + return + } + d.peerOpMu.Lock() defer d.peerOpMu.Unlock() - err := d.peerDeleteOp(nid, eid, peerIP, peerMac, vtep) - if err != nil { - log.G(context.TODO()).WithError(err).Warn("Peer delete operation failed") - } -} - -func (d *driver) peerDeleteOp(nid, eid string, peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) error { - if err := validateID(nid, eid); err != nil { - return err - } deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerMac, vtep) if !deleted { - log.G(context.TODO()).Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v vtep:%v", - nid, eid, peerIP, peerMac, vtep) + logger.Warn("Peer entry was not in db") } + if vtep.IsValid() { + err := d.deleteNeighbor(nid, peerIP, peerMac, vtep) + if err != nil { + if dbEntries > 0 && errors.As(err, &osl.NeighborSearchError{}) { + // We fall in here if there is a transient state and if the neighbor that is being deleted + // was never been configured into the kernel (we allow only 1 configuration at the time per mapping) + return + } + logger.WithError(err).Warn("Peer delete operation failed") + } + + if dbEntries > 0 { + // If there is still an entry into the database and the deletion went through without errors means that there is now no + // configuration active in the kernel. + // Restore one configuration for the directly from the database, note that is guaranteed that there is one + peerIPAddr, peerMac, peerEntry, err := d.peerDbSearch(nid, peerIP.Addr()) + if err != nil { + log.G(context.TODO()).WithFields(log.Fields{ + "nid": nid, + "ip": peerIP, + }).WithError(err).Error("peerDelete unable to restore a configuration") + return + } + peerIP = netip.PrefixFrom(peerIPAddr, peerEntry.prefixBits) + err = d.addNeighbor(nid, peerIP, peerMac, peerEntry.vtep) + if err != nil { + log.G(context.TODO()).WithFields(log.Fields{ + "nid": nid, + "eid": eid, + "ip": peerIP, + "mac": peerMac, + "vtep": peerEntry.vtep, + }).WithError(err).Error("Peer delete operation failed") + } + } + } +} + +// deleteNeighbor removes programming from the kernel for the given peer to be +// reachable through the VXLAN tunnel. It is the inverse of [driver.addNeighbor]. +func (d *driver) deleteNeighbor(nid string, peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) error { n := d.network(nid) if n == nil { return nil @@ -299,58 +330,32 @@ func (d *driver) peerDeleteOp(nid, eid string, peerIP netip.Prefix, peerMac net. } } - // Local peers do not have any local configuration to delete - if vtep.IsValid() { - s := n.getSubnetforIP(peerIP) - if s == nil { - return fmt.Errorf("could not find the subnet %q in network %q", peerIP.String(), n.id) - } - // Remove fdb entry to the bridge for the peer mac - if err := sbox.DeleteNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil { - if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 { - // We fall in here if there is a transient state and if the neighbor that is being deleted - // was never been configured into the kernel (we allow only 1 configuration at the time per mapping) - return nil - } - return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) - } - - // Delete neighbor entry for the peer IP - if err := sbox.DeleteNeighbor(peerIP.Addr().AsSlice(), peerMac, osl.WithLinkName(s.vxlanName)); err != nil { - return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) - } + s := n.getSubnetforIP(peerIP) + if s == nil { + return fmt.Errorf("could not find the subnet %q in network %q", peerIP.String(), n.id) + } + // Remove fdb entry to the bridge for the peer mac + if err := sbox.DeleteNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil { + return fmt.Errorf("could not delete fdb entry in the sandbox: %w", err) } - if dbEntries == 0 { - return nil + // Delete neighbor entry for the peer IP + if err := sbox.DeleteNeighbor(peerIP.Addr().AsSlice(), peerMac, osl.WithLinkName(s.vxlanName)); err != nil { + return fmt.Errorf("could not delete neighbor entry in the sandbox:%v", err) } - // If there is still an entry into the database and the deletion went through without errors means that there is now no - // configuration active in the kernel. - // Restore one configuration for the directly from the database, note that is guaranteed that there is one - peerIPAddr, peerMac, peerEntry, err := d.peerDbSearch(nid, peerIP.Addr()) - if err != nil { - log.G(context.TODO()).Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err) - return err - } - return d.peerAddOp(nid, peerEntry.eid, netip.PrefixFrom(peerIPAddr, peerEntry.prefixBits), peerMac, peerEntry.vtep, false) + return nil } func (d *driver) peerFlush(nid string) { d.peerOpMu.Lock() defer d.peerOpMu.Unlock() - if err := d.peerFlushOp(nid); err != nil { - log.G(context.TODO()).WithError(err).Warn("Peer flush operation failed") - } -} - -func (d *driver) peerFlushOp(nid string) error { d.peerDb.Lock() defer d.peerDb.Unlock() _, ok := d.peerDb.mp[nid] if !ok { - return fmt.Errorf("Unable to find the peerDB for nid:%s", nid) + log.G(context.TODO()).Warnf("Peer flush operation failed: unable to find the peerDB for nid:%s", nid) + return } delete(d.peerDb.mp, nid) - return nil } From 1c2b744ca249fac01383f1b7abbf267d70df3e39 Mon Sep 17 00:00:00 2001 From: Cory Snider Date: Wed, 21 May 2025 14:48:41 -0400 Subject: [PATCH 2/3] libnetwork/d/overlay: properly model peer db The overlay driver assumes that the peer table in NetworkDB will always converge to a 1:1:1 mapping from peer endpoint IP address to MAC address to VTEP. While this currently holds true in practice most of the time, it is not an invariant and there are ways that users can violate this assumption. The driver detects whether peer entries conflict with each other by matching up (IP, MAC) tuples. In the common case this works out fine as the MAC address for an endpoint is generally derived from the assigned IP address. If an IP address gets reassigned to a container on another node the MAC address will follow, so the driver's conflict resolution logic will behave as intended. However users may explicitly configure the MAC address for a container's network endpoints. If an IP address gets reassigned from a container with an auto-generated MAC address to a container with a manually-configured MAC, or vice versa, the driver would not detect the conflict as the (IP, MAC) tuples won't match up. It would attempt to program the kernel's neighbor table with two conflicting MAC addresses for one IP, which will fail. And since it does not realize that there is a conflict, the driver won't reprogram the kernel from the remaining entry when the other entry is deleted. The assumption that only one IP address may resolve to a given MAC address is violated if multiple IP addresses are assigned to an endpoint. This rarely comes up in practice today as the overlay driver only supports IPv4 single-stack connectivity for endpoints. If multiple distinct peer entries exist with the same MAC address, the driver will delete the MAC->VTEP mapping from the kernel's forwarding database when any entry is deleted, even if other entries remain active. This limitation is one of the biggest obstacles in the way of supporting IPv6 and dual-stack connectivity for endpoints attached to overlay networks. Modify the peer db logic to correctly handle the cases where peer entries have non-unique MAC or VTEP values. Treat any set of entries with non-unique IP addresses as a conflict, irrespective of the entries' MAC addresses. Maintain a reference count of forwarding database entries and only delete the MAC->VTEP mapping from the kernel when there are no longer any neighbor entries which resolve to that MAC. Signed-off-by: Cory Snider --- libnetwork/drivers/overlay/ov_network.go | 8 +- libnetwork/drivers/overlay/peerdb.go | 121 ++++++++---------- libnetwork/internal/countmap/countmap.go | 19 +++ libnetwork/internal/countmap/countmap_test.go | 27 ++++ 4 files changed, 106 insertions(+), 69 deletions(-) create mode 100644 libnetwork/internal/countmap/countmap.go create mode 100644 libnetwork/internal/countmap/countmap_test.go diff --git a/libnetwork/drivers/overlay/ov_network.go b/libnetwork/drivers/overlay/ov_network.go index 5b533f481e..929713c330 100644 --- a/libnetwork/drivers/overlay/ov_network.go +++ b/libnetwork/drivers/overlay/ov_network.go @@ -1,4 +1,5 @@ -//go:build linux +// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16: +//go:build go1.23 && linux package overlay @@ -18,6 +19,7 @@ import ( "github.com/docker/docker/internal/nlwrap" "github.com/docker/docker/libnetwork/driverapi" "github.com/docker/docker/libnetwork/drivers/overlay/overlayutils" + "github.com/docker/docker/libnetwork/internal/countmap" "github.com/docker/docker/libnetwork/internal/netiputil" "github.com/docker/docker/libnetwork/netlabel" "github.com/docker/docker/libnetwork/ns" @@ -53,6 +55,8 @@ type network struct { endpoints endpointTable driver *driver joinCnt int + // Ref count of VXLAN Forwarding Database entries programmed into the kernel + fdbCnt countmap.Map[ipmac] sboxInit bool initEpoch int initErr error @@ -99,6 +103,7 @@ func (d *driver) CreateNetwork(ctx context.Context, id string, option map[string driver: d, endpoints: endpointTable{}, subnets: []*subnet{}, + fdbCnt: countmap.Map[ipmac]{}, } vnis := make([]uint32, 0, len(ipV4Data)) @@ -586,6 +591,7 @@ func (n *network) initSandbox() error { // this is needed to let the peerAdd configure the sandbox n.sbox = sbox + n.fdbCnt = countmap.Map[ipmac]{} return nil } diff --git a/libnetwork/drivers/overlay/peerdb.go b/libnetwork/drivers/overlay/peerdb.go index 5c103ad8e5..d655b0c4d7 100644 --- a/libnetwork/drivers/overlay/peerdb.go +++ b/libnetwork/drivers/overlay/peerdb.go @@ -20,9 +20,9 @@ import ( const ovPeerTable = "overlay_peer_table" type peerEntry struct { - eid string - vtep netip.Addr // Virtual Tunnel End Point for non-local peers - prefixBits int // number of 1-bits in network mask of peerIP + eid string + mac macAddr + vtep netip.Addr } func (p *peerEntry) isLocal() bool { @@ -30,8 +30,7 @@ func (p *peerEntry) isLocal() bool { } type peerMap struct { - // set of peerEntry, note the values have to be objects and not pointers to maintain the proper equality checks - mp setmatrix.SetMatrix[ipmac, peerEntry] + mp setmatrix.SetMatrix[netip.Prefix, peerEntry] sync.Mutex } @@ -41,16 +40,16 @@ type peerNetworkMap struct { sync.Mutex } -func (d *driver) peerDbNetworkWalk(nid string, f func(netip.Addr, net.HardwareAddr, *peerEntry) bool) error { +func (d *driver) peerDbNetworkWalk(nid string, f func(netip.Prefix, peerEntry) bool) { d.peerDb.Lock() pMap, ok := d.peerDb.mp[nid] d.peerDb.Unlock() if !ok { - return nil + return } - mp := map[ipmac]peerEntry{} + mp := map[netip.Prefix]peerEntry{} pMap.Lock() for _, pKey := range pMap.mp.Keys() { entryDBList, ok := pMap.mp.Get(pKey) @@ -60,38 +59,28 @@ func (d *driver) peerDbNetworkWalk(nid string, f func(netip.Addr, net.HardwareAd } pMap.Unlock() - for pKey, pEntry := range mp { - if f(pKey.ip, pKey.mac.HardwareAddr(), &pEntry) { - return nil + for k, v := range mp { + if f(k, v) { + return } } - - return nil } -func (d *driver) peerDbSearch(nid string, peerIP netip.Addr) (netip.Addr, net.HardwareAddr, *peerEntry, error) { - var peerIPMatched netip.Addr - var peerMacMatched net.HardwareAddr - var pEntryMatched *peerEntry - err := d.peerDbNetworkWalk(nid, func(ip netip.Addr, mac net.HardwareAddr, pEntry *peerEntry) bool { - if ip == peerIP { - peerIPMatched = ip - peerMacMatched = mac - pEntryMatched = pEntry - return true - } - - return false - }) - if err != nil { - return netip.Addr{}, nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err) +func (d *driver) peerDbGet(nid string, peerIP netip.Prefix) (peerEntry, bool) { + d.peerDb.Lock() + pMap, ok := d.peerDb.mp[nid] + d.peerDb.Unlock() + if !ok { + return peerEntry{}, false } - if !peerIPMatched.IsValid() || pEntryMatched == nil { - return netip.Addr{}, nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP) + pMap.Lock() + defer pMap.Unlock() + c, _ := pMap.mp.Get(peerIP) + if len(c) == 0 { + return peerEntry{}, false } - - return peerIPMatched, peerMacMatched, pEntryMatched, nil + return c[0], true } func (d *driver) peerDbAdd(nid, eid string, peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) (bool, int) { @@ -103,22 +92,21 @@ func (d *driver) peerDbAdd(nid, eid string, peerIP netip.Prefix, peerMac net.Har } d.peerDb.Unlock() - pKey := ipmacOf(peerIP.Addr(), peerMac) - pEntry := peerEntry{ - eid: eid, - vtep: vtep, - prefixBits: peerIP.Bits(), + eid: eid, + mac: macAddrOf(peerMac), + vtep: vtep, } pMap.Lock() defer pMap.Unlock() - b, i := pMap.mp.Insert(pKey, pEntry) + b, i := pMap.mp.Insert(peerIP, pEntry) if i != 1 { - // Transient case, there is more than one endpoint that is using the same IP,MAC pair - s, _ := pMap.mp.String(pKey) - log.G(context.TODO()).Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s) + // Transient case, there is more than one endpoint that is using the same IP + s, _ := pMap.mp.String(peerIP) + log.G(context.TODO()).Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", peerIP, i, s) } + return b, i } @@ -131,21 +119,19 @@ func (d *driver) peerDbDelete(nid, eid string, peerIP netip.Prefix, peerMac net. } d.peerDb.Unlock() - pKey := ipmacOf(peerIP.Addr(), peerMac) - pEntry := peerEntry{ - eid: eid, - vtep: vtep, - prefixBits: peerIP.Bits(), + eid: eid, + mac: macAddrOf(peerMac), + vtep: vtep, } pMap.Lock() defer pMap.Unlock() - b, i := pMap.mp.Remove(pKey, pEntry) + b, i := pMap.mp.Remove(peerIP, pEntry) if i != 0 { - // Transient case, there is more than one endpoint that is using the same IP,MAC pair - s, _ := pMap.mp.String(pKey) - log.G(context.TODO()).Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey, i, s) + // Transient case, there is more than one endpoint that is using the same IP + s, _ := pMap.mp.String(peerIP) + log.G(context.TODO()).Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", peerIP, i, s) } return b, i } @@ -164,16 +150,12 @@ func (d *driver) peerDbDelete(nid, eid string, peerIP netip.Prefix, peerMac net. func (d *driver) initSandboxPeerDB(nid string) { d.peerOpMu.Lock() defer d.peerOpMu.Unlock() - err := d.peerDbNetworkWalk(nid, func(peerIP netip.Addr, peerMac net.HardwareAddr, pEntry *peerEntry) bool { + d.peerDbNetworkWalk(nid, func(peerIP netip.Prefix, pEntry peerEntry) bool { if !pEntry.isLocal() { - d.addNeighbor(nid, netip.PrefixFrom(peerIP, pEntry.prefixBits), peerMac, pEntry.vtep) + d.addNeighbor(nid, peerIP, pEntry.mac.HardwareAddr(), pEntry.vtep) } return false // walk all entries }) - - if err != nil { - log.G(context.TODO()).WithError(err).Warn("Peer init operation failed") - } } // peerAdd adds a new entry to the peer database. @@ -242,8 +224,10 @@ func (d *driver) addNeighbor(nid string, peerIP netip.Prefix, peerMac net.Hardwa } // Add fdb entry to the bridge for the peer mac - if err := sbox.AddNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil { - return fmt.Errorf("could not add fdb entry into the sandbox: %w", err) + if n.fdbCnt.Add(ipmacOf(vtep, peerMac), 1) == 1 { + if err := sbox.AddNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil { + return fmt.Errorf("could not add fdb entry into the sandbox: %w", err) + } } return nil @@ -287,23 +271,22 @@ func (d *driver) peerDelete(nid, eid string, peerIP netip.Prefix, peerMac net.Ha if dbEntries > 0 { // If there is still an entry into the database and the deletion went through without errors means that there is now no // configuration active in the kernel. - // Restore one configuration for the directly from the database, note that is guaranteed that there is one - peerIPAddr, peerMac, peerEntry, err := d.peerDbSearch(nid, peerIP.Addr()) - if err != nil { + // Restore one configuration for the ip directly from the database, note that is guaranteed that there is one + peerEntry, ok := d.peerDbGet(nid, peerIP) + if !ok { log.G(context.TODO()).WithFields(log.Fields{ "nid": nid, "ip": peerIP, - }).WithError(err).Error("peerDelete unable to restore a configuration") + }).Error("peerDelete unable to restore a configuration: no entry found in the database") return } - peerIP = netip.PrefixFrom(peerIPAddr, peerEntry.prefixBits) - err = d.addNeighbor(nid, peerIP, peerMac, peerEntry.vtep) + err = d.addNeighbor(nid, peerIP, peerEntry.mac.HardwareAddr(), peerEntry.vtep) if err != nil { log.G(context.TODO()).WithFields(log.Fields{ "nid": nid, "eid": eid, "ip": peerIP, - "mac": peerMac, + "mac": peerEntry.mac, "vtep": peerEntry.vtep, }).WithError(err).Error("Peer delete operation failed") } @@ -335,8 +318,10 @@ func (d *driver) deleteNeighbor(nid string, peerIP netip.Prefix, peerMac net.Har return fmt.Errorf("could not find the subnet %q in network %q", peerIP.String(), n.id) } // Remove fdb entry to the bridge for the peer mac - if err := sbox.DeleteNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil { - return fmt.Errorf("could not delete fdb entry in the sandbox: %w", err) + if n.fdbCnt.Add(ipmacOf(vtep, peerMac), -1) == 0 { + if err := sbox.DeleteNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil { + return fmt.Errorf("could not delete fdb entry in the sandbox: %w", err) + } } // Delete neighbor entry for the peer IP diff --git a/libnetwork/internal/countmap/countmap.go b/libnetwork/internal/countmap/countmap.go new file mode 100644 index 0000000000..a028ed9684 --- /dev/null +++ b/libnetwork/internal/countmap/countmap.go @@ -0,0 +1,19 @@ +// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16: +//go:build go1.23 + +package countmap + +// Map is a map of counters. +type Map[T comparable] map[T]int + +// Add adds delta to the counter for v and returns the new value. +// +// If the new value is 0, the entry is removed from the map. +func (m Map[T]) Add(v T, delta int) int { + m[v] += delta + c := m[v] + if c == 0 { + delete(m, v) + } + return c +} diff --git a/libnetwork/internal/countmap/countmap_test.go b/libnetwork/internal/countmap/countmap_test.go new file mode 100644 index 0000000000..ab33a3b0c7 --- /dev/null +++ b/libnetwork/internal/countmap/countmap_test.go @@ -0,0 +1,27 @@ +package countmap_test + +import ( + "testing" + + "github.com/docker/docker/libnetwork/internal/countmap" + "gotest.tools/v3/assert" + is "gotest.tools/v3/assert/cmp" +) + +func TestMap(t *testing.T) { + m := countmap.Map[string]{} + m["foo"] = 7 + m["bar"] = 2 + m["zeroed"] = -2 + + m.Add("bar", -3) + m.Add("foo", -8) + m.Add("baz", 1) + m.Add("zeroed", 2) + assert.Check(t, is.DeepEqual(m, countmap.Map[string]{"foo": -1, "bar": -1, "baz": 1})) + + m.Add("foo", 1) + m.Add("bar", 1) + m.Add("baz", -1) + assert.Check(t, is.DeepEqual(m, countmap.Map[string]{})) +} From 057e35dd655586bf5e928efa54fec9658f4ffbf5 Mon Sep 17 00:00:00 2001 From: Cory Snider Date: Wed, 21 May 2025 18:03:13 -0400 Subject: [PATCH 3/3] libnetwork/d/overlay: ref-count encryption params The IPsec encryption parameters (Security Association Database and Security Policy Database entries) for a particular overlay network peer (VTEP) are shared global state as they have to be programmed into the root network namespace. The same parameters are used when encrypting VXLAN traffic to a particular VTEP for all overlay networks. Deleting the entries for a VTEP will break encryption to that VTEP across all encrypted overlay networks, therefore the decision of when to delete the entries must take the state of all overlay networks into account. Unfortunately this is not the case. The overlay driver uses local per-network state to decide when to program and delete the parameters for a VTEP. In practice, the parameters for all VTEPs participating in an encrypted overlay network are deleted when the network is deleted. Encryption to that VTEP over all other active encrypted overlay networks would be broken until some other incidental peerDB event triggered a re-programming of the parameters for that VTEP. Change the setupEncryption and removeEncryption functions to be reference-counted. The removeEncryption function needs to be called the same number of times as addEncryption before the parameters are deleted from the kernel. Signed-off-by: Cory Snider --- libnetwork/drivers/overlay/encryption.go | 55 +++++++++++++++--------- libnetwork/drivers/overlay/overlay.go | 2 +- libnetwork/drivers/overlay/peerdb.go | 4 +- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/libnetwork/drivers/overlay/encryption.go b/libnetwork/drivers/overlay/encryption.go index 19149b89aa..12268123fe 100644 --- a/libnetwork/drivers/overlay/encryption.go +++ b/libnetwork/drivers/overlay/encryption.go @@ -91,8 +91,13 @@ func (s *spi) String() string { return fmt.Sprintf("SPI(FWD: 0x%x, REV: 0x%x)", uint32(s.forward), uint32(s.reverse)) } +type encrNode struct { + spi []spi + count int +} + type encrMap struct { - nodes map[netip.Addr][]*spi + nodes map[netip.Addr]encrNode sync.Mutex } @@ -105,7 +110,7 @@ func (e *encrMap) String() string { b.WriteString(k.String()) b.WriteString(":") b.WriteString("[") - for _, s := range v { + for _, s := range v.spi { b.WriteString(s.String()) b.WriteString(",") } @@ -126,10 +131,10 @@ func (d *driver) setupEncryption(remoteIP netip.Addr) error { } log.G(context.TODO()).Debugf("Programming encryption between %s and %s", localIP, remoteIP) - indices := make([]*spi, 0, len(keys)) + indices := make([]spi, 0, len(keys)) for i, k := range keys { - spis := &spi{buildSPI(advIP.AsSlice(), remoteIP.AsSlice(), k.tag), buildSPI(remoteIP.AsSlice(), advIP.AsSlice(), k.tag)} + spis := spi{buildSPI(advIP.AsSlice(), remoteIP.AsSlice(), k.tag), buildSPI(remoteIP.AsSlice(), advIP.AsSlice(), k.tag)} dir := reverse if i == 0 { dir = bidir @@ -149,7 +154,10 @@ func (d *driver) setupEncryption(remoteIP netip.Addr) error { } d.secMap.Lock() - d.secMap.nodes[remoteIP] = indices + node := d.secMap.nodes[remoteIP] + node.spi = indices + node.count++ + d.secMap.nodes[remoteIP] = node d.secMap.Unlock() return nil @@ -158,13 +166,20 @@ func (d *driver) setupEncryption(remoteIP netip.Addr) error { func (d *driver) removeEncryption(remoteIP netip.Addr) error { log.G(context.TODO()).Debugf("removeEncryption(%s)", remoteIP) - d.secMap.Lock() - indices, ok := d.secMap.nodes[remoteIP] - d.secMap.Unlock() - if !ok { + spi := func() []spi { + d.secMap.Lock() + defer d.secMap.Unlock() + node := d.secMap.nodes[remoteIP] + if node.count == 1 { + delete(d.secMap.nodes, remoteIP) + return node.spi + } + node.count-- + d.secMap.nodes[remoteIP] = node return nil - } - for i, idxs := range indices { + }() + + for i, idxs := range spi { dir := reverse if i == 0 { dir = bidir @@ -263,7 +278,7 @@ func (d *driver) programInput(vni uint32, add bool) error { return nil } -func programSA(localIP, remoteIP net.IP, spi *spi, k *key, dir int, add bool) (fSA *netlink.XfrmState, rSA *netlink.XfrmState, lastErr error) { +func programSA(localIP, remoteIP net.IP, spi spi, k *key, dir int, add bool) (fSA *netlink.XfrmState, rSA *netlink.XfrmState, lastErr error) { var ( action = "Removing" xfrmProgram = ns.NlHandle().XfrmStateDel @@ -436,12 +451,12 @@ func buildAeadAlgo(k *key, s int) *netlink.XfrmStateAlgo { } } -func (d *driver) secMapWalk(f func(netip.Addr, []*spi) ([]*spi, bool)) error { +func (d *driver) secMapWalk(f func(netip.Addr, []spi) ([]spi, bool)) error { d.secMap.Lock() - for node, indices := range d.secMap.nodes { - idxs, stop := f(node, indices) + for rIP, node := range d.secMap.nodes { + idxs, stop := f(rIP, node.spi) if idxs != nil { - d.secMap.nodes[node] = idxs + d.secMap.nodes[rIP] = encrNode{idxs, node.count} } if stop { break @@ -457,7 +472,7 @@ func (d *driver) setKeys(keys []*key) error { // Accept the encryption keys and clear any stale encryption map d.Lock() d.keys = keys - d.secMap = &encrMap{nodes: map[netip.Addr][]*spi{}} + d.secMap = &encrMap{nodes: map[netip.Addr]encrNode{}} d.Unlock() log.G(context.TODO()).Debugf("Initial encryption keys: %v", keys) return nil @@ -506,7 +521,7 @@ func (d *driver) updateKeys(newKey, primary, pruneKey *key) error { return types.InvalidParameterErrorf("attempting to both make a key (index %d) primary and delete it", priIdx) } - d.secMapWalk(func(rIP netip.Addr, spis []*spi) ([]*spi, bool) { + d.secMapWalk(func(rIP netip.Addr, spis []spi) ([]spi, bool) { return updateNodeKey(lIP.AsSlice(), aIP.AsSlice(), rIP.AsSlice(), spis, d.keys, newIdx, priIdx, delIdx), false }) @@ -534,7 +549,7 @@ func (d *driver) updateKeys(newKey, primary, pruneKey *key) error { *********************************************************/ // Spis and keys are sorted in such away the one in position 0 is the primary -func updateNodeKey(lIP, aIP, rIP net.IP, idxs []*spi, curKeys []*key, newIdx, priIdx, delIdx int) []*spi { +func updateNodeKey(lIP, aIP, rIP net.IP, idxs []spi, curKeys []*key, newIdx, priIdx, delIdx int) []spi { log.G(context.TODO()).Debugf("Updating keys for node: %s (%d,%d,%d)", rIP, newIdx, priIdx, delIdx) spis := idxs @@ -542,7 +557,7 @@ func updateNodeKey(lIP, aIP, rIP net.IP, idxs []*spi, curKeys []*key, newIdx, pr // add new if newIdx != -1 { - spis = append(spis, &spi{ + spis = append(spis, spi{ forward: buildSPI(aIP, rIP, curKeys[newIdx].tag), reverse: buildSPI(rIP, aIP, curKeys[newIdx].tag), }) diff --git a/libnetwork/drivers/overlay/overlay.go b/libnetwork/drivers/overlay/overlay.go index e0063d76ac..9332e2709a 100644 --- a/libnetwork/drivers/overlay/overlay.go +++ b/libnetwork/drivers/overlay/overlay.go @@ -47,7 +47,7 @@ func Register(r driverapi.Registerer, config map[string]interface{}) error { peerDb: peerNetworkMap{ mp: map[string]*peerMap{}, }, - secMap: &encrMap{nodes: map[netip.Addr][]*spi{}}, + secMap: &encrMap{nodes: map[netip.Addr]encrNode{}}, config: config, } return r.RegisterDriver(NetworkType, d, driverapi.Capability{ diff --git a/libnetwork/drivers/overlay/peerdb.go b/libnetwork/drivers/overlay/peerdb.go index d655b0c4d7..b72ed2bb8c 100644 --- a/libnetwork/drivers/overlay/peerdb.go +++ b/libnetwork/drivers/overlay/peerdb.go @@ -212,7 +212,7 @@ func (d *driver) addNeighbor(nid string, peerIP netip.Prefix, peerMac net.Hardwa return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err) } - if n.secure && len(n.endpoints) > 0 { + if n.secure { if err := d.setupEncryption(vtep); err != nil { log.G(context.TODO()).Warn(err) } @@ -307,7 +307,7 @@ func (d *driver) deleteNeighbor(nid string, peerIP netip.Prefix, peerMac net.Har return nil } - if n.secure && len(n.endpoints) == 0 { + if n.secure { if err := d.removeEncryption(vtep); err != nil { log.G(context.TODO()).Warn(err) }