From a7dc7bf1f460e8635dc3a89fb3d56db09207dfa8 Mon Sep 17 00:00:00 2001 From: Donavan Fritz Date: Sat, 25 Apr 2026 09:57:32 -0500 Subject: [PATCH] anycast: kernel multipath route + L4 hash for multi-pod-per-node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move pure resolver logic out of anycast_linux.go into anycast.go so it's unit-testable on any host. Reshape anycastTarget from a single {hostIface, via} into a sorted list of nexthops; multiple Ready pods on the same node binding the same anycast IP now contribute one nexthop each. installAnycastRoute uses RTA_MULTIPATH (via netlink.Route.MultiPath) when the target has more than one nexthop. Single-nexthop targets keep the simple via-route shape so 1-pod-per-node keeps rendering identically to today's production form in `ip route show`. flock-agent writes net.ipv{4,6}.fib_multipath_hash_policy = 1 at startup so the kernel hashes flows on (saddr, daddr, sport, dport, proto) rather than just IPs. The write is best-effort: flock-agent runs privileged in production, so it succeeds there; in environments where the write fails, the failure is logged and the kernel keeps its default L3 hash (only matters for the multi-pod-per-node case anyway). resolveAnycastTargets sorts nexthops by canonical(via) for stable comparison so a quiet reconcile pass doesn't churn the kernel route. 8 new resolver unit tests cover: 1-pod, 2-pods-same-anycast (multi-nexthop), NotReady drop, no-Ready omits the IP, pending skipped, mixed v6+v4, family mismatch warns, determinism; a ninth test covers anycastTarget.equal. 
Co-Authored-By: Claude Sonnet 4.6 (1M context) --- pkg/agent/anycast.go | 110 ++++++++++++++++++ pkg/agent/anycast_linux.go | 145 ++++++++++++----------- pkg/agent/anycast_test.go | 227 +++++++++++++++++++++++++++++++++++++ pkg/agent/runtime_linux.go | 27 +++++ 4 files changed, 436 insertions(+), 73 deletions(-) create mode 100644 pkg/agent/anycast.go create mode 100644 pkg/agent/anycast_test.go diff --git a/pkg/agent/anycast.go b/pkg/agent/anycast.go new file mode 100644 index 0000000..759d761 --- /dev/null +++ b/pkg/agent/anycast.go @@ -0,0 +1,110 @@ +package agent + +import ( + "net" + "sort" +) + +// anycastNexthop is one (host-side veth, pod-eth0-IP) pair the kernel route +// can use as a multipath nexthop. +type anycastNexthop struct { + hostIface string + via net.IP +} + +// anycastTarget describes the kernel route shape for one advertised anycast +// IP. When more than one Ready pod on this node binds the same anycast IP, +// every Ready pod contributes a nexthop and the kernel does per-flow ECMP +// across them. +// +// nexthops is sorted by canonical(via) for deterministic comparison and +// stable kernel-route ordering across reconcile passes — the +// AnycastReconciler skips kernel writes when the new and old targets are +// equal, which only works if the slice order is stable. +type anycastTarget struct { + nexthops []anycastNexthop +} + +// equal reports whether two targets describe the same kernel route. +// Both sides are expected to be sorted (the canonical constructor sorts). 
+func (t anycastTarget) equal(o anycastTarget) bool { + if len(t.nexthops) != len(o.nexthops) { + return false + } + for i := range t.nexthops { + if t.nexthops[i].hostIface != o.nexthops[i].hostIface { + return false + } + if !t.nexthops[i].via.Equal(o.nexthops[i].via) { + return false + } + } + return true +} + +// resolveAnycastTargets walks the committed allocation set and returns the +// desired kernel-route shape for every anycast IP that has at least one +// Ready local pod binding it. Multiple Ready pods sharing the same anycast +// IP collapse into a single multi-nexthop target so the kernel can +// per-flow ECMP across them. +// +// Pure: no kernel calls, no informer access. Pods are surfaced via the +// isReady callback so the reconciler can plug in its informer; tests can +// pass any function that satisfies the signature. +// +// warn is invoked for human-facing skip reasons (e.g. anycast with no +// unicast of same family). nil-safe — pass nil to silently drop. +func resolveAnycastTargets( + allocations []Allocation, + isReady func(namespace, name string) bool, + warn func(string), +) map[string]anycastTarget { + if warn == nil { + warn = func(string) {} + } + out := map[string]anycastTarget{} + for _, a := range allocations { + if a.State != StateCommitted || len(a.Anycast) == 0 { + continue + } + if !isReady(a.Namespace, a.PodName) { + continue + } + host := HostIfaceName(a.ContainerID) + via6 := net.ParseIP(a.IP6) + via4 := net.ParseIP(a.IP4) + for _, ipStr := range a.Anycast { + ip := net.ParseIP(ipStr) + if ip == nil { + continue + } + var via net.IP + if ip.To4() != nil { + via = via4 + } else { + via = via6 + } + if via == nil { + warn("anycast " + ipStr + " skipped: pod " + + a.Namespace + "/" + a.PodName + + " has no unicast of same family") + continue + } + key := canonical(ip) + t := out[key] + t.nexthops = append(t.nexthops, anycastNexthop{hostIface: host, via: via}) + out[key] = t + } + } + // Sort each target's nexthops for stable 
comparison + stable kernel + // ordering. Sort key is canonical(via) — sufficient for stability + // because (host, via) pairs are 1:1 (one veth per pod, one v6+v4 per + // pod, so via uniquely identifies the nexthop). + for k, t := range out { + sort.Slice(t.nexthops, func(i, j int) bool { + return canonical(t.nexthops[i].via) < canonical(t.nexthops[j].via) + }) + out[k] = t + } + return out +} diff --git a/pkg/agent/anycast_linux.go b/pkg/agent/anycast_linux.go index ee70596..4aaf403 100644 --- a/pkg/agent/anycast_linux.go +++ b/pkg/agent/anycast_linux.go @@ -26,6 +26,11 @@ import ( // - Pod transitions to Ready=False or DELETE → remove kernel route, remove // from BIRD export. // +// When more than one Ready pod on this node binds the same anycast IP, the +// kernel route uses RTA_MULTIPATH so the kernel does per-flow ECMP across +// the contributing pods. This is the within-node companion to BGP-level +// ECMP across nodes. +// // Reconcile is idempotent. Triggers: AfterCommit hook, Pod informer // UpdateFunc on Ready transitions, periodic 2s tick. type AnycastReconciler struct { @@ -42,13 +47,6 @@ type AnycastReconciler struct { trigger chan struct{} } -// anycastTarget describes the kernel route shape for one advertised -// anycast IP: which veth, and which pod eth0 IP to use as next-hop. -type anycastTarget struct { - hostIface string - via net.IP -} - // NewAnycastReconciler returns a Reconciler ready to Run. func NewAnycastReconciler(node string, store *Store, pods *PodCache, nc *NodeConfigCache, bird *BirdManager, routerID string, logger *slog.Logger) *AnycastReconciler { return &AnycastReconciler{ @@ -96,25 +94,26 @@ func (r *AnycastReconciler) reconcile() { desired := r.computeDesired() - // Install routes that should exist but don't (or whose target changed). + // Install routes that should exist but don't, or whose nexthop set + // changed. 
for ip, t := range desired { - if cur, ok := r.advertised[ip]; ok && cur.hostIface == t.hostIface && cur.via.Equal(t.via) { + if cur, ok := r.advertised[ip]; ok && cur.equal(t) { continue } if err := installAnycastRoute(ip, t); err != nil { - r.Logger.Warn("anycast install", "ip", ip, "host", t.hostIface, "via", t.via, "err", err) + r.Logger.Warn("anycast install", "ip", ip, "nexthops", len(t.nexthops), "err", err) continue } - r.Logger.Info("anycast advertise", "ip", ip, "host", t.hostIface, "via", t.via) + r.Logger.Info("anycast advertise", "ip", ip, "nexthops", describeNexthops(t)) r.advertised[ip] = t } // Remove routes that exist but shouldn't. for ip, t := range r.advertised { if _, want := desired[ip]; !want { if err := removeAnycastRoute(ip, t); err != nil { - r.Logger.Warn("anycast remove", "ip", ip, "host", t.hostIface, "err", err) + r.Logger.Warn("anycast remove", "ip", ip, "err", err) } else { - r.Logger.Info("anycast withdraw", "ip", ip, "host", t.hostIface) + r.Logger.Info("anycast withdraw", "ip", ip) } delete(r.advertised, ip) } @@ -124,44 +123,17 @@ func (r *AnycastReconciler) reconcile() { r.renderBird(desired) } -// computeDesired walks the Store and returns the per-ip anycastTarget for -// every anycast advertisement that should be active right now. Each target -// uses the pod's own eth0 IP (same family) as the route's `via` next-hop — -// that way kernel NDP/ARP resolves the eth0 address, which IS configured -// on the pod's eth0, so the pod responds normally without proxy_ndp. +// computeDesired delegates to the pure resolveAnycastTargets and plugs in +// the live informer-based isReady callback. 
func (r *AnycastReconciler) computeDesired() map[string]anycastTarget { - out := map[string]anycastTarget{} - for _, a := range r.Store.Snapshot() { - if a.State != StateCommitted || len(a.Anycast) == 0 { - continue - } - pod, ok := r.Pods.Get(a.Namespace, a.PodName) - if !ok || !podReady(pod) { - continue - } - host := HostIfaceName(a.ContainerID) - via6 := net.ParseIP(a.IP6) - via4 := net.ParseIP(a.IP4) - for _, ipStr := range a.Anycast { - ip := net.ParseIP(ipStr) - if ip == nil { - continue - } - var via net.IP - if ip.To4() != nil { - via = via4 - } else { - via = via6 - } - if via == nil { - r.Logger.Warn("anycast skipped: pod has no unicast IP of same family", - "pod", a.Namespace+"/"+a.PodName, "anycast", ipStr) - continue - } - out[canonical(ip)] = anycastTarget{hostIface: host, via: via} - } - } - return out + return resolveAnycastTargets( + r.Store.Snapshot(), + func(ns, name string) bool { + pod, ok := r.Pods.Get(ns, name) + return ok && podReady(pod) + }, + func(s string) { r.Logger.Warn(s) }, + ) } func (r *AnycastReconciler) renderBird(desired map[string]anycastTarget) { @@ -186,56 +158,71 @@ func (r *AnycastReconciler) renderBird(desired map[string]anycastTarget) { } } -// installAnycastRoute installs `/<128|32> via t.via dev t.hostIface`. +// installAnycastRoute installs `/<128|32>` pointing at the +// nexthop set in t. With one nexthop the route is a plain via-route; +// with multiple, it's a multipath route using RTA_MULTIPATH so the +// kernel hashes flows across the constituent pods. +// // Idempotent — RouteReplace overwrites a stale entry. 
func installAnycastRoute(ipStr string, t anycastTarget) error { ip := net.ParseIP(ipStr) if ip == nil { return fmt.Errorf("bad ip %q", ipStr) } - link, err := netlink.LinkByName(t.hostIface) - if err != nil { - return fmt.Errorf("lookup %s: %w", t.hostIface, err) + if len(t.nexthops) == 0 { + return fmt.Errorf("anycast %s: no nexthops", ipStr) } prefix := 128 if ip.To4() != nil { prefix = 32 ip = ip.To4() } - r := &netlink.Route{ - LinkIndex: link.Attrs().Index, - Dst: cidrFor(ip, prefix), - Gw: t.via, - // SCOPE_UNIVERSE — the gateway is on a different "logical" subnet - // than the local /128 route, but reachable on this veth. Linux is - // happy as long as the veth has IPv6 forwarding on (it does — set - // in configureHostSide) and the pod's eth0 has the via address - // (also true — that's the pod's IP6/IP4 we allocated). + r := &netlink.Route{Dst: cidrFor(ip, prefix)} + if len(t.nexthops) == 1 { + // Single nexthop — keep the route shape identical to today's + // production form. Functionally equivalent to a 1-element + // MultiPath but `ip route show` renders nicer for operators. + nh := t.nexthops[0] + link, err := netlink.LinkByName(nh.hostIface) + if err != nil { + return fmt.Errorf("lookup %s: %w", nh.hostIface, err) + } + r.LinkIndex = link.Attrs().Index + r.Gw = nh.via + } else { + hops := make([]*netlink.NexthopInfo, 0, len(t.nexthops)) + for _, nh := range t.nexthops { + link, err := netlink.LinkByName(nh.hostIface) + if err != nil { + return fmt.Errorf("lookup %s: %w", nh.hostIface, err) + } + hops = append(hops, &netlink.NexthopInfo{ + LinkIndex: link.Attrs().Index, + Gw: nh.via, + Hops: 0, + }) + } + r.MultiPath = hops } return netlink.RouteReplace(r) } // removeAnycastRoute deletes the host route. Missing routes / interfaces // are treated as success — DEL paths can race with veth teardown. 
-func removeAnycastRoute(ipStr string, t anycastTarget) error { +// +// Kernel route deletion matches by destination prefix; we don't need to +// re-specify the nexthop set. +func removeAnycastRoute(ipStr string, _ anycastTarget) error { ip := net.ParseIP(ipStr) if ip == nil { return nil } - link, err := netlink.LinkByName(t.hostIface) - if err != nil { - return nil - } prefix := 128 if ip.To4() != nil { prefix = 32 ip = ip.To4() } - r := &netlink.Route{ - LinkIndex: link.Attrs().Index, - Dst: cidrFor(ip, prefix), - Gw: t.via, - } + r := &netlink.Route{Dst: cidrFor(ip, prefix)} if err := netlink.RouteDel(r); err != nil { // ESRCH ("no such process") is netlink-speak for "no such route"; // treat as success. @@ -247,5 +234,17 @@ func removeAnycastRoute(ipStr string, t anycastTarget) error { return nil } +// describeNexthops returns a compact string for log messages. +func describeNexthops(t anycastTarget) string { + var s string + for i, nh := range t.nexthops { + if i > 0 { + s += "," + } + s += nh.hostIface + "→" + nh.via.String() + } + return s +} + // _ = flockv1alpha1 to silence unused import warnings on minimal builds. var _ = flockv1alpha1.GroupName diff --git a/pkg/agent/anycast_test.go b/pkg/agent/anycast_test.go new file mode 100644 index 0000000..5c3c41e --- /dev/null +++ b/pkg/agent/anycast_test.go @@ -0,0 +1,227 @@ +package agent + +import ( + "net" + "strings" + "testing" +) + +// allReady is a convenience isReady that says yes to every pod. +func allReady(_, _ string) bool { return true } + +// readyOnly returns an isReady that only says yes to the named pods. 
+func readyOnly(want ...string) func(string, string) bool { + set := map[string]struct{}{} + for _, n := range want { + set[n] = struct{}{} + } + return func(_, name string) bool { + _, ok := set[name] + return ok + } +} + +func TestResolveAnycastTargets_OnePodOneAnycast(t *testing.T) { + allocs := []Allocation{{ + ContainerID: "c1", Namespace: "ns", PodName: "pod-a", + State: StateCommitted, + IP6: "2001:db8::1", + Anycast: []string{"2001:db8:a::1"}, + }} + out := resolveAnycastTargets(allocs, allReady, nil) + if len(out) != 1 { + t.Fatalf("expected 1 anycast IP, got %d", len(out)) + } + tgt, ok := out["2001:db8:a::1"] + if !ok { + t.Fatalf("missing target") + } + if len(tgt.nexthops) != 1 { + t.Fatalf("expected 1 nexthop, got %d", len(tgt.nexthops)) + } + if !tgt.nexthops[0].via.Equal(net.ParseIP("2001:db8::1")) { + t.Fatalf("nexthop via wrong: %v", tgt.nexthops[0].via) + } +} + +// Two pods on the same node binding the same anycast IP must produce a +// SINGLE target with TWO nexthops. The previous behaviour (overwriting) +// was the bug this whole change exists to fix. +func TestResolveAnycastTargets_TwoPodsSameAnycast_MultiNexthop(t *testing.T) { + allocs := []Allocation{ + {ContainerID: "c1", Namespace: "ns", PodName: "pod-a", + State: StateCommitted, IP6: "2001:db8::2", + Anycast: []string{"2001:db8:a::1"}}, + {ContainerID: "c2", Namespace: "ns", PodName: "pod-b", + State: StateCommitted, IP6: "2001:db8::1", + Anycast: []string{"2001:db8:a::1"}}, + } + out := resolveAnycastTargets(allocs, allReady, nil) + tgt := out["2001:db8:a::1"] + if len(tgt.nexthops) != 2 { + t.Fatalf("expected 2 nexthops, got %d", len(tgt.nexthops)) + } + // Order should be sorted by canonical(via) — ::1 before ::2. 
+ if !tgt.nexthops[0].via.Equal(net.ParseIP("2001:db8::1")) { + t.Fatalf("nexthops not sorted by via; got %v first", tgt.nexthops[0].via) + } + if !tgt.nexthops[1].via.Equal(net.ParseIP("2001:db8::2")) { + t.Fatalf("nexthops not sorted by via; got %v second", tgt.nexthops[1].via) + } + // HostIface differs per pod (different containerID → different FNV). + if tgt.nexthops[0].hostIface == tgt.nexthops[1].hostIface { + t.Fatalf("expected distinct hostIfaces, both %q", tgt.nexthops[0].hostIface) + } +} + +// When one of the contributing pods goes NotReady, only the remaining +// Ready pod should appear in the target's nexthop set. +func TestResolveAnycastTargets_NotReadyDropped(t *testing.T) { + allocs := []Allocation{ + {ContainerID: "c1", Namespace: "ns", PodName: "pod-a", + State: StateCommitted, IP6: "2001:db8::1", + Anycast: []string{"2001:db8:a::1"}}, + {ContainerID: "c2", Namespace: "ns", PodName: "pod-b", + State: StateCommitted, IP6: "2001:db8::2", + Anycast: []string{"2001:db8:a::1"}}, + } + out := resolveAnycastTargets(allocs, readyOnly("pod-a"), nil) + tgt := out["2001:db8:a::1"] + if len(tgt.nexthops) != 1 { + t.Fatalf("expected 1 nexthop after NotReady drop, got %d", len(tgt.nexthops)) + } + if !tgt.nexthops[0].via.Equal(net.ParseIP("2001:db8::1")) { + t.Fatalf("wrong surviving nexthop: %v", tgt.nexthops[0].via) + } +} + +// Pods that haven't reached Ready are excluded entirely from the target +// set. If no pod is Ready for an anycast IP, that IP is absent from the +// output (BIRD will withdraw from BGP, kernel route will be removed). 
+func TestResolveAnycastTargets_NoReadyPodsOmitsIP(t *testing.T) { + allocs := []Allocation{ + {ContainerID: "c1", Namespace: "ns", PodName: "pod-a", + State: StateCommitted, IP6: "2001:db8::1", + Anycast: []string{"2001:db8:a::1"}}, + } + out := resolveAnycastTargets(allocs, readyOnly( /* none */ ), nil) + if _, ok := out["2001:db8:a::1"]; ok { + t.Fatalf("anycast should be absent when no pod ready") + } +} + +// Pending allocations (CNI ADD partway through) are skipped even if the +// pod is Ready — we don't program kernel routes for partial setups. +func TestResolveAnycastTargets_PendingSkipped(t *testing.T) { + allocs := []Allocation{ + {ContainerID: "c1", Namespace: "ns", PodName: "pod-a", + State: StatePending, IP6: "2001:db8::1", + Anycast: []string{"2001:db8:a::1"}}, + } + out := resolveAnycastTargets(allocs, allReady, nil) + if len(out) != 0 { + t.Fatalf("pending allocations must be skipped") + } +} + +// Mixed v6+v4 anycast on the same pod produces two separate target +// entries, one per family, each anchored on the matching unicast IP. +func TestResolveAnycastTargets_MixedFamilies(t *testing.T) { + allocs := []Allocation{{ + ContainerID: "c1", Namespace: "ns", PodName: "pod-a", + State: StateCommitted, + IP6: "2001:db8::1", + IP4: "10.0.0.1", + Anycast: []string{"2001:db8:a::1", "10.255.0.1"}, + }} + out := resolveAnycastTargets(allocs, allReady, nil) + if !out["2001:db8:a::1"].nexthops[0].via.Equal(net.ParseIP("2001:db8::1")) { + t.Fatalf("v6 anycast should resolve via v6 unicast") + } + if !out["10.255.0.1"].nexthops[0].via.Equal(net.ParseIP("10.0.0.1").To4()) { + t.Fatalf("v4 anycast should resolve via v4 unicast") + } +} + +// An anycast whose family has no matching unicast on the pod is skipped +// with a warning. Other anycast IPs on the same pod are unaffected. 
+func TestResolveAnycastTargets_FamilyMismatchWarns(t *testing.T) { + allocs := []Allocation{{ + ContainerID: "c1", Namespace: "ns", PodName: "pod-a", + State: StateCommitted, + IP6: "2001:db8::1", // v6 only + Anycast: []string{"2001:db8:a::1", "10.255.0.1"}, + }} + var warns []string + out := resolveAnycastTargets(allocs, allReady, func(s string) { warns = append(warns, s) }) + if _, has := out["2001:db8:a::1"]; !has { + t.Fatalf("v6 anycast should have been programmed") + } + if _, has := out["10.255.0.1"]; has { + t.Fatalf("v4 anycast should have been skipped") + } + if len(warns) != 1 { + t.Fatalf("expected 1 warning, got %d: %v", len(warns), warns) + } + if !strings.Contains(warns[0], "10.255.0.1") { + t.Fatalf("warning should mention skipped IP: %q", warns[0]) + } +} + +// Determinism: the same input must produce nexthops in the same order. +func TestResolveAnycastTargets_Determinism(t *testing.T) { + allocs := []Allocation{ + {ContainerID: "z-late", Namespace: "ns", PodName: "z", + State: StateCommitted, IP6: "2001:db8::5", + Anycast: []string{"2001:db8:a::1"}}, + {ContainerID: "a-early", Namespace: "ns", PodName: "a", + State: StateCommitted, IP6: "2001:db8::3", + Anycast: []string{"2001:db8:a::1"}}, + {ContainerID: "m-mid", Namespace: "ns", PodName: "m", + State: StateCommitted, IP6: "2001:db8::4", + Anycast: []string{"2001:db8:a::1"}}, + } + a := resolveAnycastTargets(allocs, allReady, nil) + b := resolveAnycastTargets(allocs, allReady, nil) + if !a["2001:db8:a::1"].equal(b["2001:db8:a::1"]) { + t.Fatalf("same input produced unequal targets") + } + // Sorted by canonical(via): ::3, ::4, ::5 + via := a["2001:db8:a::1"].nexthops + if !via[0].via.Equal(net.ParseIP("2001:db8::3")) || + !via[1].via.Equal(net.ParseIP("2001:db8::4")) || + !via[2].via.Equal(net.ParseIP("2001:db8::5")) { + t.Fatalf("nexthops not stably sorted: %v %v %v", via[0].via, via[1].via, via[2].via) + } +} + +// equal()'s contract — different orderings are still considered equal +// AS 
LONG AS both sides have been canonicalised by resolveAnycastTargets. +// Across-call comparisons of resolver outputs must always match for the +// same logical input. +func TestAnycastTarget_Equal(t *testing.T) { + a := anycastTarget{nexthops: []anycastNexthop{ + {hostIface: "f1", via: net.ParseIP("2001:db8::1")}, + {hostIface: "f2", via: net.ParseIP("2001:db8::2")}, + }} + b := anycastTarget{nexthops: []anycastNexthop{ + {hostIface: "f1", via: net.ParseIP("2001:db8::1")}, + {hostIface: "f2", via: net.ParseIP("2001:db8::2")}, + }} + if !a.equal(b) { + t.Fatalf("equal targets reported unequal") + } + c := anycastTarget{nexthops: []anycastNexthop{ + {hostIface: "f1", via: net.ParseIP("2001:db8::1")}, + }} + if a.equal(c) { + t.Fatalf("targets with different lengths reported equal") + } + d := anycastTarget{nexthops: []anycastNexthop{ + {hostIface: "f1", via: net.ParseIP("2001:db8::1")}, + {hostIface: "f2", via: net.ParseIP("2001:db8::3")}, // diff IP + }} + if a.equal(d) { + t.Fatalf("targets with different vias reported equal") + } +} diff --git a/pkg/agent/runtime_linux.go b/pkg/agent/runtime_linux.go index 7913b34..517fa5c 100644 --- a/pkg/agent/runtime_linux.go +++ b/pkg/agent/runtime_linux.go @@ -6,11 +6,36 @@ import ( "context" "fmt" "net" + "os" "time" "code.fritzlab.net/fritzlab/flock/pkg/agent/netpol" ) +// hostMultipathHashSysctls is the set of node-level sysctls flock-agent +// best-effort writes at startup. Default policy 0 hashes only on +// (saddr, daddr); policy 1 adds L4 (sport, dport, proto), giving real +// per-connection ECMP across multipath nexthops — required for sensible +// distribution across multiple anycast pods on the same node. +var hostMultipathHashSysctls = map[string]string{ + "/proc/sys/net/ipv4/fib_multipath_hash_policy": "1", + "/proc/sys/net/ipv6/fib_multipath_hash_policy": "1", +} + +// applyHostSysctls writes the sysctls in m, logging but not failing on +// errors. 
flock-agent is privileged so this works in the production +// DaemonSet; in environments where it doesn't, single-pod-per-node +// anycast still works (this only affects the multi-pod-per-node case). +func applyHostSysctls(s *Server) { + for path, value := range hostMultipathHashSysctls { + if err := os.WriteFile(path, []byte(value), 0o644); err != nil { + s.Logger.Warn("set host sysctl", "path", path, "value", value, "err", err) + continue + } + s.Logger.Info("host sysctl set", "path", path, "value", value) + } +} + // configureRuntime wires Pod informer, IPAM, netlink, and BIRD on a real // Linux node. Steps: // @@ -23,6 +48,8 @@ import ( // 5. Build PodHandler and SetHandlers(add, del, check). // 6. Install BIRD blackhole summary routes + render initial config. func (s *Server) configureRuntime(ctx context.Context) error { + applyHostSysctls(s) + if err := s.firstAvailableNodeConfig(ctx, 60*time.Second); err != nil { return err }