anycast: kernel multipath route + L4 hash for multi-pod-per-node
Build flock Image / build (push) Has been cancelled

Move pure resolver logic out of anycast_linux.go into anycast.go so it's
unit-testable on any host. Reshape anycastTarget from a single
{hostIface, via} into a sorted list of nexthops; multiple Ready pods on
the same node binding the same anycast IP now contribute one nexthop
each.

installAnycastRoute uses RTA_MULTIPATH (via netlink.Route.MultiPath)
when the target has more than one nexthop. Single-nexthop targets keep
the simple via-route shape so 1-pod-per-node keeps rendering identically
to today's production form in `ip route show`.

flock-agent writes net.ipv{4,6}.fib_multipath_hash_policy = 1 at
startup so the kernel hashes flows on (saddr, daddr, sport, dport, proto)
rather than just the IP addresses. The write is best-effort: the agent
runs privileged in production, so it succeeds there; in environments
where the write fails, the kernel falls back to the L3 hash (which only
matters for the multi-pod-per-node case anyway).

resolveAnycastTargets sorts nexthops by canonical(via) for stable
comparison so a quiet reconcile pass doesn't churn the kernel route.

8 new unit tests cover: 1-pod, 2-pods-same-anycast (multi-nexthop),
NotReady drop, no-Ready omits the IP, pending skipped, mixed v6+v4,
family mismatch warns, determinism.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Donavan Fritz
2026-04-25 09:57:32 -05:00
parent 5d9b6bfeec
commit a7dc7bf1f4
4 changed files with 436 additions and 73 deletions
+72 -73
View File
@@ -26,6 +26,11 @@ import (
// - Pod transitions to Ready=False or DELETE → remove kernel route, remove
// from BIRD export.
//
// When more than one Ready pod on this node binds the same anycast IP, the
// kernel route uses RTA_MULTIPATH so the kernel does per-flow ECMP across
// the contributing pods. This is the within-node companion to BGP-level
// ECMP across nodes.
//
// Reconcile is idempotent. Triggers: AfterCommit hook, Pod informer
// UpdateFunc on Ready transitions, periodic 2s tick.
type AnycastReconciler struct {
@@ -42,13 +47,6 @@ type AnycastReconciler struct {
trigger chan struct{}
}
// anycastTarget describes the kernel route shape for one advertised
// anycast IP: which veth, and which pod eth0 IP to use as next-hop.
type anycastTarget struct {
hostIface string
via net.IP
}
// NewAnycastReconciler returns a Reconciler ready to Run.
func NewAnycastReconciler(node string, store *Store, pods *PodCache, nc *NodeConfigCache, bird *BirdManager, routerID string, logger *slog.Logger) *AnycastReconciler {
return &AnycastReconciler{
@@ -96,25 +94,26 @@ func (r *AnycastReconciler) reconcile() {
desired := r.computeDesired()
// Install routes that should exist but don't (or whose target changed).
// Install routes that should exist but don't, or whose nexthop set
// changed.
for ip, t := range desired {
if cur, ok := r.advertised[ip]; ok && cur.hostIface == t.hostIface && cur.via.Equal(t.via) {
if cur, ok := r.advertised[ip]; ok && cur.equal(t) {
continue
}
if err := installAnycastRoute(ip, t); err != nil {
r.Logger.Warn("anycast install", "ip", ip, "host", t.hostIface, "via", t.via, "err", err)
r.Logger.Warn("anycast install", "ip", ip, "nexthops", len(t.nexthops), "err", err)
continue
}
r.Logger.Info("anycast advertise", "ip", ip, "host", t.hostIface, "via", t.via)
r.Logger.Info("anycast advertise", "ip", ip, "nexthops", describeNexthops(t))
r.advertised[ip] = t
}
// Remove routes that exist but shouldn't.
for ip, t := range r.advertised {
if _, want := desired[ip]; !want {
if err := removeAnycastRoute(ip, t); err != nil {
r.Logger.Warn("anycast remove", "ip", ip, "host", t.hostIface, "err", err)
r.Logger.Warn("anycast remove", "ip", ip, "err", err)
} else {
r.Logger.Info("anycast withdraw", "ip", ip, "host", t.hostIface)
r.Logger.Info("anycast withdraw", "ip", ip)
}
delete(r.advertised, ip)
}
@@ -124,44 +123,17 @@ func (r *AnycastReconciler) reconcile() {
r.renderBird(desired)
}
// computeDesired walks the Store and returns the per-ip anycastTarget for
// every anycast advertisement that should be active right now. Each target
// uses the pod's own eth0 IP (same family) as the route's `via` next-hop —
// that way kernel NDP/ARP resolves the eth0 address, which IS configured
// on the pod's eth0, so the pod responds normally without proxy_ndp.
// computeDesired delegates to the pure resolveAnycastTargets and plugs in
// the live informer-based isReady callback.
func (r *AnycastReconciler) computeDesired() map[string]anycastTarget {
out := map[string]anycastTarget{}
for _, a := range r.Store.Snapshot() {
if a.State != StateCommitted || len(a.Anycast) == 0 {
continue
}
pod, ok := r.Pods.Get(a.Namespace, a.PodName)
if !ok || !podReady(pod) {
continue
}
host := HostIfaceName(a.ContainerID)
via6 := net.ParseIP(a.IP6)
via4 := net.ParseIP(a.IP4)
for _, ipStr := range a.Anycast {
ip := net.ParseIP(ipStr)
if ip == nil {
continue
}
var via net.IP
if ip.To4() != nil {
via = via4
} else {
via = via6
}
if via == nil {
r.Logger.Warn("anycast skipped: pod has no unicast IP of same family",
"pod", a.Namespace+"/"+a.PodName, "anycast", ipStr)
continue
}
out[canonical(ip)] = anycastTarget{hostIface: host, via: via}
}
}
return out
return resolveAnycastTargets(
r.Store.Snapshot(),
func(ns, name string) bool {
pod, ok := r.Pods.Get(ns, name)
return ok && podReady(pod)
},
func(s string) { r.Logger.Warn(s) },
)
}
func (r *AnycastReconciler) renderBird(desired map[string]anycastTarget) {
@@ -186,56 +158,71 @@ func (r *AnycastReconciler) renderBird(desired map[string]anycastTarget) {
}
}
// installAnycastRoute installs `<ipStr>/<128|32> via t.via dev t.hostIface`.
// installAnycastRoute installs `<ipStr>/<128|32>` pointing at the
// nexthop set in t. With one nexthop the route is a plain via-route;
// with multiple, it's a multipath route using RTA_MULTIPATH so the
// kernel hashes flows across the constituent pods.
//
// Idempotent — RouteReplace overwrites a stale entry.
func installAnycastRoute(ipStr string, t anycastTarget) error {
ip := net.ParseIP(ipStr)
if ip == nil {
return fmt.Errorf("bad ip %q", ipStr)
}
link, err := netlink.LinkByName(t.hostIface)
if err != nil {
return fmt.Errorf("lookup %s: %w", t.hostIface, err)
if len(t.nexthops) == 0 {
return fmt.Errorf("anycast %s: no nexthops", ipStr)
}
prefix := 128
if ip.To4() != nil {
prefix = 32
ip = ip.To4()
}
r := &netlink.Route{
LinkIndex: link.Attrs().Index,
Dst: cidrFor(ip, prefix),
Gw: t.via,
// SCOPE_UNIVERSE — the gateway is on a different "logical" subnet
// than the local /128 route, but reachable on this veth. Linux is
// happy as long as the veth has IPv6 forwarding on (it does — set
// in configureHostSide) and the pod's eth0 has the via address
// (also true — that's the pod's IP6/IP4 we allocated).
r := &netlink.Route{Dst: cidrFor(ip, prefix)}
if len(t.nexthops) == 1 {
// Single nexthop — keep the route shape identical to today's
// production form. Functionally equivalent to a 1-element
// MultiPath but `ip route show` renders nicer for operators.
nh := t.nexthops[0]
link, err := netlink.LinkByName(nh.hostIface)
if err != nil {
return fmt.Errorf("lookup %s: %w", nh.hostIface, err)
}
r.LinkIndex = link.Attrs().Index
r.Gw = nh.via
} else {
hops := make([]*netlink.NexthopInfo, 0, len(t.nexthops))
for _, nh := range t.nexthops {
link, err := netlink.LinkByName(nh.hostIface)
if err != nil {
return fmt.Errorf("lookup %s: %w", nh.hostIface, err)
}
hops = append(hops, &netlink.NexthopInfo{
LinkIndex: link.Attrs().Index,
Gw: nh.via,
Hops: 0,
})
}
r.MultiPath = hops
}
return netlink.RouteReplace(r)
}
// removeAnycastRoute deletes the host route. Missing routes / interfaces
// are treated as success — DEL paths can race with veth teardown.
func removeAnycastRoute(ipStr string, t anycastTarget) error {
//
// Kernel route deletion matches by destination prefix; we don't need to
// re-specify the nexthop set.
func removeAnycastRoute(ipStr string, _ anycastTarget) error {
ip := net.ParseIP(ipStr)
if ip == nil {
return nil
}
link, err := netlink.LinkByName(t.hostIface)
if err != nil {
return nil
}
prefix := 128
if ip.To4() != nil {
prefix = 32
ip = ip.To4()
}
r := &netlink.Route{
LinkIndex: link.Attrs().Index,
Dst: cidrFor(ip, prefix),
Gw: t.via,
}
r := &netlink.Route{Dst: cidrFor(ip, prefix)}
if err := netlink.RouteDel(r); err != nil {
// ESRCH ("no such process") is netlink-speak for "no such route";
// treat as success.
@@ -247,5 +234,17 @@ func removeAnycastRoute(ipStr string, t anycastTarget) error {
return nil
}
// describeNexthops renders the nexthop set of t as a single
// comma-separated string of "<hostIface>→<via>" entries, in the order the
// nexthops are stored. An empty nexthop set yields the empty string. The
// result is intended for log output only.
func describeNexthops(t anycastTarget) string {
	desc := ""
	for idx := range t.nexthops {
		hop := t.nexthops[idx]
		// Every entry except the first is preceded by a comma.
		sep := ","
		if idx == 0 {
			sep = ""
		}
		desc += sep + hop.hostIface + "→" + hop.via.String()
	}
	return desc
}
// Reference flockv1alpha1 so its import stays used: Go rejects unused
// imports at compile time, which would otherwise break minimal builds.
var _ = flockv1alpha1.GroupName