Compare commits

...

2 Commits

Author SHA1 Message Date
Sebastiaan van Stijn
d05a725121 Merge pull request #51843 from corhere/backport-29.x/fix-service-binding-soft-disable
[docker-29.x backport] libnetwork: fix graceful service endpoint removal
2026-01-14 23:46:09 +01:00
Cory Snider
01fc4fcf8b libnetwork: fix graceful service endpoint removal
When a container is stopped, new connections to a published port will
not be routed to that container but any existing connections are
permitted to continue uninterrupted for the duration of the container's
grace period. Unfortunately recent fixes to overlay networks introduced
a regression: existing connections routed over the service mesh to
containers on remote nodes are dropped immediately when the container is
stopped, irrespective of the grace period.

Fix the handling of NetworkDB endpoint table events so that the endpoint
is disabled in the load balancer when a service endpoint transitions to
ServiceDisabled instead of deleting the endpoint and re-adding it. And
fix the other bugged state transitions with the help of a unit test
which exhaustively covers all permutations of endpoint event.

Signed-off-by: Cory Snider <csnider@mirantis.com>
(cherry picked from commit 9ec65542a0)
Signed-off-by: Cory Snider <csnider@mirantis.com>
2026-01-13 18:56:23 -05:00
2 changed files with 359 additions and 21 deletions

View File

@@ -362,7 +362,7 @@ func (c *Controller) agentInit(listenAddr, bindAddrOrInterface, advertiseAddr, d
}
c.mu.Unlock()
go c.handleTableEvents(ch, c.handleEpTableEvent)
go c.handleTableEvents(ch, func(ev events.Event) { handleEpTableEvent(c, ev) })
go c.handleTableEvents(nodeCh, c.handleNodeTableEvent)
keys, tags := c.getKeys(subsysIPSec)
@@ -894,14 +894,17 @@ func unmarshalEndpointRecord(data []byte) (*endpointEvent, error) {
}, nil
}
// EquivalentTo returns true if ev is semantically equivalent to other.
// EquivalentTo returns true if ev is semantically equivalent to other,
// ignoring the ServiceDisabled field.
func (ev *endpointEvent) EquivalentTo(other *endpointEvent) bool {
if ev == nil || other == nil {
return (ev == nil) == (other == nil)
}
return ev.Name == other.Name &&
ev.ServiceName == other.ServiceName &&
ev.ServiceID == other.ServiceID &&
ev.VirtualIP == other.VirtualIP &&
ev.EndpointIP == other.EndpointIP &&
ev.ServiceDisabled == other.ServiceDisabled &&
iterutil.SameValues(
iterutil.Deref(slices.Values(ev.IngressPorts)),
iterutil.Deref(slices.Values(other.IngressPorts))) &&
@@ -909,7 +912,14 @@ func (ev *endpointEvent) EquivalentTo(other *endpointEvent) bool {
iterutil.SameValues(slices.Values(ev.TaskAliases), slices.Values(other.TaskAliases))
}
func (c *Controller) handleEpTableEvent(ev events.Event) {
type serviceBinder interface {
addContainerNameResolution(nID, eID, containerName string, taskAliases []string, ip net.IP, method string) error
delContainerNameResolution(nID, eID, containerName string, taskAliases []string, ip net.IP, method string) error
addServiceBinding(svcName, svcID, nID, eID, containerName string, vip net.IP, ingressPorts []*PortConfig, serviceAliases, taskAliases []string, ip net.IP, method string) error
rmServiceBinding(svcName, svcID, nID, eID, containerName string, vip net.IP, ingressPorts []*PortConfig, serviceAliases []string, taskAliases []string, ip net.IP, method string, deleteSvcRecords bool, fullRemove bool) error
}
func handleEpTableEvent(c serviceBinder, ev events.Event) {
event := ev.(networkdb.WatchEvent)
nid := event.NetworkID
eid := event.Key
@@ -939,8 +949,9 @@ func (c *Controller) handleEpTableEvent(ev events.Event) {
})
logger.Debug("handleEpTableEvent")
equivalent := prev.EquivalentTo(epRec)
if prev != nil {
if epRec != nil && prev.EquivalentTo(epRec) {
if equivalent && prev.ServiceDisabled == epRec.ServiceDisabled {
// Avoid flapping if we would otherwise remove a service
// binding then immediately replace it with an equivalent one.
return
@@ -948,7 +959,11 @@ func (c *Controller) handleEpTableEvent(ev events.Event) {
if prev.ServiceID != "" {
// This is a remote task part of a service
if !prev.ServiceDisabled {
if epRec == nil || !equivalent {
// Either the endpoint is deleted from NetworkDB or has
// been replaced with a different one. Remove the old
// binding. The new binding, if any, will be added
// below.
err := c.rmServiceBinding(prev.ServiceName, prev.ServiceID, nid, eid,
prev.Name, prev.VirtualIP.AsSlice(), prev.IngressPorts,
prev.Aliases, prev.TaskAliases, prev.EndpointIP.AsSlice(),
@@ -957,7 +972,7 @@ func (c *Controller) handleEpTableEvent(ev events.Event) {
logger.WithError(err).Error("failed removing service binding")
}
}
} else {
} else if !prev.ServiceDisabled && (!equivalent || epRec == nil || epRec.ServiceDisabled) {
// This is a remote container simply attached to an attachable network
err := c.delContainerNameResolution(nid, eid, prev.Name, prev.TaskAliases,
prev.EndpointIP.AsSlice(), "handleEpTableEvent")
@@ -970,19 +985,16 @@ func (c *Controller) handleEpTableEvent(ev events.Event) {
if epRec != nil {
if epRec.ServiceID != "" {
// This is a remote task part of a service
if epRec.ServiceDisabled {
// Don't double-remove a service binding
if prev == nil || prev.ServiceID != epRec.ServiceID || !prev.ServiceDisabled {
err := c.rmServiceBinding(epRec.ServiceName, epRec.ServiceID,
nid, eid, epRec.Name, epRec.VirtualIP.AsSlice(),
epRec.IngressPorts, epRec.Aliases, epRec.TaskAliases,
epRec.EndpointIP.AsSlice(), "handleEpTableEvent", true, false)
if err != nil {
logger.WithError(err).Error("failed disabling service binding")
return
}
if equivalent && !prev.ServiceDisabled && epRec.ServiceDisabled {
err := c.rmServiceBinding(epRec.ServiceName, epRec.ServiceID,
nid, eid, epRec.Name, epRec.VirtualIP.AsSlice(),
epRec.IngressPorts, epRec.Aliases, epRec.TaskAliases,
epRec.EndpointIP.AsSlice(), "handleEpTableEvent", true, false)
if err != nil {
logger.WithError(err).Error("failed disabling service binding")
return
}
} else {
} else if !epRec.ServiceDisabled {
err := c.addServiceBinding(epRec.ServiceName, epRec.ServiceID, nid, eid,
epRec.Name, epRec.VirtualIP.AsSlice(), epRec.IngressPorts,
epRec.Aliases, epRec.TaskAliases, epRec.EndpointIP.AsSlice(),
@@ -992,7 +1004,7 @@ func (c *Controller) handleEpTableEvent(ev events.Event) {
return
}
}
} else {
} else if !epRec.ServiceDisabled && (!equivalent || prev == nil || prev.ServiceDisabled) {
// This is a remote container simply attached to an attachable network
err := c.addContainerNameResolution(nid, eid, epRec.Name, epRec.TaskAliases,
epRec.EndpointIP.AsSlice(), "handleEpTableEvent")

View File

@@ -1,11 +1,16 @@
package libnetwork
import (
"fmt"
"net"
"net/netip"
"slices"
"testing"
"github.com/gogo/protobuf/proto"
"gotest.tools/v3/assert"
"github.com/moby/moby/v2/daemon/libnetwork/networkdb"
)
func TestEndpointEvent_EquivalentTo(t *testing.T) {
@@ -40,9 +45,12 @@ func TestEndpointEvent_EquivalentTo(t *testing.T) {
return a.EquivalentTo(b)
}
assert.Check(t, reflexiveEquiv(nil, nil), "nil should be equivalent to nil")
assert.Check(t, !reflexiveEquiv(&a, nil), "non-nil should not be equivalent to nil")
b := a
b.ServiceDisabled = true
assert.Check(t, !reflexiveEquiv(&a, &b), "differing by ServiceDisabled")
assert.Check(t, reflexiveEquiv(&a, &b), "ServiceDisabled value should not matter")
c := a
c.IngressPorts = slices.Clone(a.IngressPorts)
@@ -88,3 +96,321 @@ func TestEndpointEvent_EquivalentTo(t *testing.T) {
l.Name = "aaaaa"
assert.Check(t, !reflexiveEquiv(&a, &l), "Differing Name should not be equivalent")
}
type mockServiceBinder struct {
actions []string
}
func (m *mockServiceBinder) addContainerNameResolution(nID, eID, containerName string, _ []string, ip net.IP, _ string) error {
m.actions = append(m.actions, fmt.Sprintf("addContainerNameResolution(%v, %v, %v, %v)", nID, eID, containerName, ip))
return nil
}
func (m *mockServiceBinder) delContainerNameResolution(nID, eID, containerName string, _ []string, ip net.IP, _ string) error {
m.actions = append(m.actions, fmt.Sprintf("delContainerNameResolution(%v, %v, %v, %v)", nID, eID, containerName, ip))
return nil
}
func (m *mockServiceBinder) addServiceBinding(svcName, svcID, nID, eID, containerName string, vip net.IP, _ []*PortConfig, _, _ []string, ip net.IP, _ string) error {
m.actions = append(m.actions, fmt.Sprintf("addServiceBinding(%v, %v, %v, %v, %v, %v, %v)", svcName, svcID, nID, eID, containerName, vip, ip))
return nil
}
func (m *mockServiceBinder) rmServiceBinding(svcName, svcID, nID, eID, containerName string, vip net.IP, _ []*PortConfig, _, _ []string, ip net.IP, _ string, deleteSvcRecords bool, fullRemove bool) error {
m.actions = append(m.actions, fmt.Sprintf("rmServiceBinding(%v, %v, %v, %v, %v, %v, %v, deleteSvcRecords=%v, fullRemove=%v)", svcName, svcID, nID, eID, containerName, vip, ip, deleteSvcRecords, fullRemove))
return nil
}
func TestHandleEPTableEvent(t *testing.T) {
svc1 := EndpointRecord{
Name: "ep1",
ServiceName: "svc1",
ServiceID: "id1",
VirtualIP: "10.0.0.1",
EndpointIP: "192.168.12.42",
}
svc1disabled := svc1
svc1disabled.ServiceDisabled = true
svc2 := EndpointRecord{
Name: "ep2",
ServiceName: "svc2",
ServiceID: "id2",
VirtualIP: "10.0.0.2",
EndpointIP: "172.16.69.5",
}
svc2disabled := svc2
svc2disabled.ServiceDisabled = true
ctr1 := EndpointRecord{
Name: "ctr1",
EndpointIP: "172.18.1.1",
}
ctr1disabled := ctr1
ctr1disabled.ServiceDisabled = true
ctr2 := EndpointRecord{
Name: "ctr2",
EndpointIP: "172.18.1.2",
}
ctr2disabled := ctr2
ctr2disabled.ServiceDisabled = true
tests := []struct {
name string
prev, ev *EndpointRecord
expectedActions []string
}{
{
name: "Insert/Service/ServiceDisabled=false",
ev: &svc1,
expectedActions: []string{
"addServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42)",
},
},
{
name: "Insert/Service/ServiceDisabled=true",
ev: &svc1disabled,
},
{
name: "Insert/Container/ServiceDisabled=false",
ev: &ctr1,
expectedActions: []string{
"addContainerNameResolution(network1, endpoint1, ctr1, 172.18.1.1)",
},
},
{
name: "Insert/Container/ServiceDisabled=true",
ev: &ctr1disabled,
},
{
name: "Update/Service/ServiceDisabled=ft",
prev: &svc1,
ev: &svc1disabled,
expectedActions: []string{
"rmServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42, deleteSvcRecords=true, fullRemove=false)",
},
},
{
name: "Update/Service/ServiceDisabled=tf",
prev: &svc1disabled,
ev: &svc1,
expectedActions: []string{
"addServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42)",
},
},
{
name: "Update/Service/ServiceDisabled=ff",
prev: &svc1disabled,
ev: &svc1disabled,
},
{
name: "Update/Service/ServiceDisabled=tt",
prev: &svc1,
ev: &svc1,
},
{
name: "Update/Container/ServiceDisabled=ft",
prev: &ctr1,
ev: &ctr1disabled,
expectedActions: []string{
"delContainerNameResolution(network1, endpoint1, ctr1, 172.18.1.1)",
},
},
{
name: "Update/Container/ServiceDisabled=tf",
prev: &ctr1disabled,
ev: &ctr1,
expectedActions: []string{
"addContainerNameResolution(network1, endpoint1, ctr1, 172.18.1.1)",
},
},
{
name: "Update/Container/ServiceDisabled=ff",
prev: &ctr1disabled,
ev: &ctr1disabled,
},
{
name: "Update/Container/ServiceDisabled=tt",
prev: &ctr1,
ev: &ctr1,
},
{
name: "Delete/Service/ServiceDisabled=false",
prev: &svc1,
expectedActions: []string{
"rmServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42, deleteSvcRecords=true, fullRemove=true)",
},
},
{
name: "Delete/Service/ServiceDisabled=true",
prev: &svc1disabled,
expectedActions: []string{
"rmServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42, deleteSvcRecords=true, fullRemove=true)",
},
},
{
name: "Delete/Container/ServiceDisabled=false",
prev: &ctr1,
expectedActions: []string{
"delContainerNameResolution(network1, endpoint1, ctr1, 172.18.1.1)",
},
},
{
name: "Delete/Container/ServiceDisabled=true",
prev: &ctr1disabled,
},
{
name: "Replace/From=Service/To=Service/ServiceDisabled=ff",
prev: &svc1,
ev: &svc2,
expectedActions: []string{
"rmServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42, deleteSvcRecords=true, fullRemove=true)",
"addServiceBinding(svc2, id2, network1, endpoint1, ep2, 10.0.0.2, 172.16.69.5)",
},
},
{
name: "Replace/From=Service/To=Service/ServiceDisabled=ft",
prev: &svc1,
ev: &svc2disabled,
expectedActions: []string{
"rmServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42, deleteSvcRecords=true, fullRemove=true)",
},
},
{
name: "Replace/From=Service/To=Service/ServiceDisabled=tf",
prev: &svc1disabled,
ev: &svc2,
expectedActions: []string{
"rmServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42, deleteSvcRecords=true, fullRemove=true)",
"addServiceBinding(svc2, id2, network1, endpoint1, ep2, 10.0.0.2, 172.16.69.5)",
},
},
{
name: "Replace/From=Service/To=Service/ServiceDisabled=tt",
prev: &svc1disabled,
ev: &svc2disabled,
expectedActions: []string{
"rmServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42, deleteSvcRecords=true, fullRemove=true)",
},
},
{
name: "Replace/From=Service/To=Container/ServiceDisabled=ff",
prev: &svc1,
ev: &ctr2,
expectedActions: []string{
"rmServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42, deleteSvcRecords=true, fullRemove=true)",
"addContainerNameResolution(network1, endpoint1, ctr2, 172.18.1.2)",
},
},
{
name: "Replace/From=Service/To=Container/ServiceDisabled=ft",
prev: &svc1,
ev: &ctr2disabled,
expectedActions: []string{
"rmServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42, deleteSvcRecords=true, fullRemove=true)",
},
},
{
name: "Replace/From=Service/To=Container/ServiceDisabled=tf",
prev: &svc1disabled,
ev: &ctr2,
expectedActions: []string{
"rmServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42, deleteSvcRecords=true, fullRemove=true)",
"addContainerNameResolution(network1, endpoint1, ctr2, 172.18.1.2)",
},
},
{
name: "Replace/From=Service/To=Container/ServiceDisabled=tt",
prev: &svc1disabled,
ev: &ctr2disabled,
expectedActions: []string{
"rmServiceBinding(svc1, id1, network1, endpoint1, ep1, 10.0.0.1, 192.168.12.42, deleteSvcRecords=true, fullRemove=true)",
},
},
{
name: "Replace/From=Container/To=Service/ServiceDisabled=ff",
prev: &ctr1,
ev: &svc2,
expectedActions: []string{
"delContainerNameResolution(network1, endpoint1, ctr1, 172.18.1.1)",
"addServiceBinding(svc2, id2, network1, endpoint1, ep2, 10.0.0.2, 172.16.69.5)",
},
},
{
name: "Replace/From=Container/To=Service/ServiceDisabled=ft",
prev: &ctr1,
ev: &svc2disabled,
expectedActions: []string{
"delContainerNameResolution(network1, endpoint1, ctr1, 172.18.1.1)",
},
},
{
name: "Replace/From=Container/To=Service/ServiceDisabled=tf",
prev: &ctr1disabled,
ev: &svc2,
expectedActions: []string{
"addServiceBinding(svc2, id2, network1, endpoint1, ep2, 10.0.0.2, 172.16.69.5)",
},
},
{
name: "Replace/From=Container/To=Service/ServiceDisabled=tt",
prev: &ctr1disabled,
ev: &svc2disabled,
},
{
name: "Replace/From=Container/To=Container/ServiceDisabled=ff",
prev: &ctr1,
ev: &ctr2,
expectedActions: []string{
"delContainerNameResolution(network1, endpoint1, ctr1, 172.18.1.1)",
"addContainerNameResolution(network1, endpoint1, ctr2, 172.18.1.2)",
},
},
{
name: "Replace/From=Container/To=Container/ServiceDisabled=ft",
prev: &ctr1,
ev: &ctr2disabled,
expectedActions: []string{
"delContainerNameResolution(network1, endpoint1, ctr1, 172.18.1.1)",
},
},
{
name: "Replace/From=Container/To=Container/ServiceDisabled=tf",
prev: &ctr1disabled,
ev: &ctr2,
expectedActions: []string{
"addContainerNameResolution(network1, endpoint1, ctr2, 172.18.1.2)",
},
},
{
name: "Replace/From=Container/To=Container/ServiceDisabled=tt",
prev: &ctr1disabled,
ev: &ctr2disabled,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
msb := &mockServiceBinder{}
event := networkdb.WatchEvent{
NetworkID: "network1",
Key: "endpoint1",
}
var err error
if tt.prev != nil {
event.Prev, err = proto.Marshal(tt.prev)
assert.NilError(t, err)
}
if tt.ev != nil {
event.Value, err = proto.Marshal(tt.ev)
assert.NilError(t, err)
}
handleEpTableEvent(msb, event)
assert.DeepEqual(t, tt.expectedActions, msb.actions)
})
}
}