diff --git a/libnetwork/internal/nftables/nftables_linux.go b/libnetwork/internal/nftables/nftables_linux.go index 62065f0c96..8bf8179cee 100644 --- a/libnetwork/internal/nftables/nftables_linux.go +++ b/libnetwork/internal/nftables/nftables_linux.go @@ -52,14 +52,20 @@ import ( "text/template" "github.com/containerd/log" + "go.opentelemetry.io/otel" ) +// Prefix for OTEL span names. +const spanPrefix = "libnetwork.internal.nftables" + var ( // nftPath is the path of the "nft" tool, set by [Enable] and left empty if the tool // is not present - in which case, nftables is disabled. nftPath string // incrementalUpdateTempl is a parsed text/template, used to apply incremental updates. incrementalUpdateTempl *template.Template + // reloadTempl is a parsed text/template, used to apply a whole table. + reloadTempl *template.Template // enableOnce is used by [Enable] to avoid checking the path for "nft" more than once. enableOnce sync.Once ) @@ -252,15 +258,55 @@ table {{$family}} {{$tableName}} { {{end}}{{end}} ` +// reloadTemplText is used with text/template to generate an nftables command file +// (which will be applied atomically), to fully re-create a table. +// +// It first declares the table so if it doesn't already exist, it can be deleted. +// Then it deletes the table and re-creates it. +const reloadTemplText = `{{$family := .Family}}{{$tableName := .Name}} +table {{$family}} {{$tableName}} {} +delete table {{$family}} {{$tableName}} +table {{$family}} {{$tableName}} { + {{range .VMaps}}map {{.Name}} { + type {{.ElementType}} : verdict + {{if len .Flags}}flags{{range .Flags}} {{.}}{{end}}{{end}} + {{if .Elements}}elements = { + {{range $k,$v := .Elements}}{{$k}} : {{$v}}, + {{end -}} + }{{end}} + } + {{end}} + {{range .Sets}}set {{.Name}} { + type {{.ElementType}} + {{if len .Flags}}flags{{range .Flags}} {{.}}{{end}}{{end}} + {{if .Elements}}elements = { + {{range $k,$v := .Elements}}{{$k}}, + {{end -}} + }{{end}} + } + {{end}} + {{range .Chains}}chain {{.Name}} { + {{if .ChainType}}type {{.ChainType}} hook {{.Hook}} priority {{.Priority}}; policy {{.Policy}}{{end}} + {{range .Rules}}{{.}} + {{end}} + } + {{end}} +} +` + // Apply makes incremental updates to nftables, corresponding to changes to the [TableRef] // since Apply was last called. func (t TableRef) Apply(ctx context.Context) error { - var buf bytes.Buffer + if !Enabled() { + return errors.New("nftables is not enabled") + } // Update nftables. + var buf bytes.Buffer if err := incrementalUpdateTempl.Execute(&buf, t.t); err != nil { return fmt.Errorf("failed to execute template nft ruleset: %w", err) } + if err := nftApply(ctx, buf.Bytes()); err != nil { // On error, log a line-numbered version of the generated "nft" input (because // nft error messages refer to line numbers). @@ -271,25 +317,51 @@ func (t TableRef) Apply(ctx context.Context) error { sb.Write(line) } log.G(ctx).Error("nftables: failed to update nftables:\n", sb.String(), "\n", err) + + // It's possible something destructive has happened to nftables. For example, in + // integration-cli tests, tests start daemons in the same netns as the integration + // test's own daemon. They don't always use their own daemon, but they tend to leave + // behind networks for the test infrastructure to clean up between tests. Starting + // a daemon flushes the "docker-bridges" table, so the cleanup fails to delete a + // rule that's been flushed. So, try reloading the whole table to get back in-sync. + return t.Reload(ctx) + } + + // Note that updates have been applied. + t.t.updatesApplied() + return nil +} + +// Reload deletes the table, then re-creates it, atomically. +func (t TableRef) Reload(ctx context.Context) error { + if !Enabled() { + return errors.New("nftables is not enabled") + } + + ctx = log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{"table": t.t.Name, "family": t.t.Family})) + log.G(ctx).Warn("nftables: reloading table") + + // Build the update. + var buf bytes.Buffer + if err := reloadTempl.Execute(&buf, t.t); err != nil { + return fmt.Errorf("failed to execute reload template: %w", err) + } + + if err := nftApply(ctx, buf.Bytes()); err != nil { + // On error, log a line-numbered version of the generated "nft" input (because + // nft error messages refer to line numbers). + var sb strings.Builder + for i, line := range bytes.SplitAfter(buf.Bytes(), []byte("\n")) { + sb.WriteString(strconv.Itoa(i + 1)) + sb.WriteString(":\t") + sb.Write(line) + } + log.G(ctx).Error("nftables: failed to reload nftable:\n", sb.String(), "\n", err) return err } // Note that updates have been applied. - t.t.DeleteChainCommands = t.t.DeleteChainCommands[:0] - for _, c := range t.t.Chains { - c.Dirty = false - } - for _, m := range t.t.VMaps { - m.Dirty = false - m.AddedElements = map[string]string{} - m.DeletedElements = map[string]struct{}{} - } - for _, s := range t.t.Sets { - s.Dirty = false - s.AddedElements = map[string]struct{}{} - s.DeletedElements = map[string]struct{}{} - } - t.t.Dirty = false + t.t.updatesApplied() return nil } @@ -650,6 +722,24 @@ func (s SetRef) DeleteElement(element string) error { // //////////////////////////// // Internal +func (t *table) updatesApplied() { + t.DeleteChainCommands = t.DeleteChainCommands[:0] + for _, c := range t.Chains { + c.Dirty = false + } + for _, m := range t.VMaps { + m.Dirty = false + m.AddedElements = map[string]string{} + m.DeletedElements = map[string]struct{}{} + } + for _, s := range t.Sets { + s.Dirty = false + s.AddedElements = map[string]struct{}{} + s.DeletedElements = map[string]struct{}{} + } + t.Dirty = false +} + /* Can't make text/template range over this, not sure why ... func (c *chain) Rules() iter.Seq[string] { groups := make([]int, 0, len(c.ruleGroups)) @@ -691,11 +781,18 @@ func parseTemplate() error { if err != nil { return fmt.Errorf("parsing 'incrementalUpdateTemplText': %w", err) } + reloadTempl, err = template.New("ruleset").Parse(reloadTemplText) + if err != nil { + return fmt.Errorf("parsing 'reloadTemplText': %w", err) + } return nil } // nftApply runs the "nft" command. func nftApply(ctx context.Context, nftCmd []byte) error { + ctx, span := otel.Tracer("").Start(ctx, spanPrefix+".nftApply") + defer span.End() + if !Enabled() { return errors.New("nftables is not enabled") } diff --git a/libnetwork/internal/nftables/nftables_linux_test.go b/libnetwork/internal/nftables/nftables_linux_test.go index af694b25cc..b73afd2e9c 100644 --- a/libnetwork/internal/nftables/nftables_linux_test.go +++ b/libnetwork/internal/nftables/nftables_linux_test.go @@ -36,6 +36,7 @@ func testSetup(t *testing.T) func() { func disable() { incrementalUpdateTempl = nil nftPath = "" + reloadTempl = nil enableOnce = sync.Once{} } @@ -44,7 +45,7 @@ func applyAndCheck(t *testing.T, tbl TableRef, goldenFilename string) { err := tbl.Apply(context.Background()) assert.Check(t, err) res := icmd.RunCommand("nft", "list", "ruleset") - assert.Check(t, is.Equal(res.ExitCode, 0)) + res.Assert(t, icmd.Success) golden.Assert(t, res.Combined(), goldenFilename) } @@ -250,3 +251,49 @@ func TestSet(t *testing.T) { applyAndCheck(t, tbl4, t.Name()+"_deleted4.golden") applyAndCheck(t, tbl6, t.Name()+"_deleted46.golden") } + +func TestReload(t *testing.T) { + defer testSetup(t)() + + // Create a table with some stuff in it. + const tableName = "this_is_a_table" + tbl, err := NewTable(IPv4, tableName) + assert.NilError(t, err) + bc, err := tbl.BaseChain("a_base_chain", BaseChainTypeFilter, BaseChainHookForward, BaseChainPriorityFilter) + assert.NilError(t, err) + err = bc.AppendRule(0, "counter") + assert.NilError(t, err) + m := tbl.InterfaceVMap("this_is_a_vmap") + err = m.AddElement("eth0", "return") + assert.Check(t, err) + err = m.AddElement("eth1", "return") + assert.Check(t, err) + err = tbl.PrefixSet("set4").AddElement("192.0.2.0/24") + assert.Check(t, err) + applyAndCheck(t, tbl, t.Name()+"_created.golden") + + // Delete the underlying nftables table. + deleteTable := func() { + t.Helper() + res := icmd.RunCommand("nft", "delete", "table", string(IPv4), tableName) + res.Assert(t, icmd.Success) + res = icmd.RunCommand("nft", "list", "ruleset") + res.Assert(t, icmd.Success) + assert.Check(t, is.Equal(res.Combined(), "")) + } + deleteTable() + + // Reconstruct the nftables table. + err = tbl.Reload(context.Background()) + assert.Check(t, err) + applyAndCheck(t, tbl, t.Name()+"_reloaded.golden") + + // Delete again. + deleteTable() + + // Check implicit/recovery reload - only deleting something that's gone missing + // from a vmap/set will trigger this. + err = m.DeleteElement("eth1") + assert.Check(t, err) + applyAndCheck(t, tbl, t.Name()+"_recovered.golden") +} diff --git a/libnetwork/internal/nftables/testdata/TestReload_created.golden b/libnetwork/internal/nftables/testdata/TestReload_created.golden new file mode 100644 index 0000000000..f69a290c70 --- /dev/null +++ b/libnetwork/internal/nftables/testdata/TestReload_created.golden @@ -0,0 +1,18 @@ +table ip this_is_a_table { + map this_is_a_vmap { + type ifname : verdict + elements = { "eth0" : return, + "eth1" : return } + } + + set set4 { + type ipv4_addr + flags interval + elements = { 192.0.2.0/24 } + } + + chain a_base_chain { + type filter hook forward priority filter; policy accept; + counter packets 0 bytes 0 + } +} diff --git a/libnetwork/internal/nftables/testdata/TestReload_recovered.golden b/libnetwork/internal/nftables/testdata/TestReload_recovered.golden new file mode 100644 index 0000000000..98d7036061 --- /dev/null +++ b/libnetwork/internal/nftables/testdata/TestReload_recovered.golden @@ -0,0 +1,17 @@ +table ip this_is_a_table { + map this_is_a_vmap { + type ifname : verdict + elements = { "eth0" : return } + } + + set set4 { + type ipv4_addr + flags interval + elements = { 192.0.2.0/24 } + } + + chain a_base_chain { + type filter hook forward priority filter; policy accept; + counter packets 0 bytes 0 + } +} diff --git a/libnetwork/internal/nftables/testdata/TestReload_reloaded.golden b/libnetwork/internal/nftables/testdata/TestReload_reloaded.golden new file mode 100644 index 0000000000..f69a290c70 --- /dev/null +++ b/libnetwork/internal/nftables/testdata/TestReload_reloaded.golden @@ -0,0 +1,18 @@ +table ip this_is_a_table { + map this_is_a_vmap { + type ifname : verdict + elements = { "eth0" : return, + "eth1" : return } + } + + set set4 { + type ipv4_addr + flags interval + elements = { 192.0.2.0/24 } + } + + chain a_base_chain { + type filter hook forward priority filter; policy accept; + counter packets 0 bytes 0 + } +} diff --git a/libnetwork/internal/nftables/testdata/TestTable_created.golden b/libnetwork/internal/nftables/testdata/TestTable_created.golden deleted file mode 100644 index 3bb18c8761..0000000000 --- a/libnetwork/internal/nftables/testdata/TestTable_created.golden +++ /dev/null @@ -1,4 +0,0 @@ -table ip ipv4_table { -} -table ip6 ipv6_table { -}