Files
flock/pkg/agent/anycast_linux.go
T
Donavan Fritz 2082df37e5
Build flock Image / build (push) Has been cancelled
anycast: revert to lo + add via=pod-eth0 next-hop on host route
Reverts the eth0-placement hack from e1e9544. The design doc's lo
placement is correct.

Real fix: the host's anycast /128 (or /32) route now uses the pod's own
eth0 unicast IP (same family) as the route's `via` next-hop. The kernel
then does NDP/ARP for that eth0 IP — which IS configured on the pod's
eth0 — so the pod responds normally with no proxy_ndp / proxy_arp
trickery on the anycast IP itself.

  ip -6 route add <anycast>/128 via <pod-eth0-v6> dev flock<8hex>
  ip -4 route add <anycast>/32  via <pod-eth0-v4> dev flock<8hex>

Validation: an anycast IP whose address family has no matching unicast IP
on the pod is skipped with a warning (a v4 anycast on an IPv6-only pod
cannot be neighbor-resolved via ARP this way; dual-stack is required).

Bonus cleanup: ESRCH from RouteDel is treated as success (idempotent).

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:02:51 -05:00

252 lines
6.8 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//go:build linux
package agent
import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"net"
	"slices"
	"sync"
	"syscall"
	"time"

	flockv1alpha1 "code.fritzlab.net/fritzlab/flock/pkg/api/v1alpha1"
	"github.com/vishvananda/netlink"
)
// AnycastReconciler keeps the kernel's anycast host routes and BIRD's
// advertised set in sync with (committed allocations × pod Ready).
//
// Lifecycle (per design doc):
// - CNI ADD assigns anycast IPs to pod lo (already done in netns_linux.go).
// - Pod transitions to Ready=True → install host /128 (or /32) route at
// `dev flock<8hex>` and add the IP to BIRD's export filter.
// - Pod transitions to Ready=False or DELETE → remove kernel route, remove
// from BIRD export.
//
// Reconcile is idempotent. Triggers: AfterCommit hook, Pod informer
// UpdateFunc on Ready transitions, periodic 2s tick.
type AnycastReconciler struct {
// Node is stored by the constructor; it is not read by the code visible in
// this file. NOTE(review): presumably the local node's name — confirm.
Node string
// Store supplies the allocation snapshot that computeDesired walks.
Store *Store
// Pods is queried by namespace and pod name to check pod readiness.
Pods *PodCache
// NodeConfig is loaded on every BIRD render; rendering is skipped while
// Load returns nil.
NodeConfig *NodeConfigCache
// Bird renders the BIRD configuration. May be nil, in which case BIRD
// rendering is skipped.
Bird *BirdManager
// RouterID is passed through unchanged to Bird.Render.
RouterID string
// Logger receives warn/info records from reconcile and its helpers. The
// methods call it unconditionally.
Logger *slog.Logger
// mu is held for the whole duration of a reconcile pass.
mu sync.Mutex
// advertised records the routes reconcile has installed successfully.
advertised map[string]anycastTarget // canonical IP → install info
// trigger carries reconcile requests from Trigger to Run; the constructor
// creates it with capacity 1 so pending requests coalesce.
trigger chan struct{}
}
// anycastTarget describes the kernel route shape for one advertised
// anycast IP: which veth, and which pod eth0 IP to use as next-hop.
type anycastTarget struct {
// hostIface is the name of the host-side interface the route is bound to;
// it is resolved with netlink.LinkByName at install/remove time.
hostIface string
// via is the route's gateway (next-hop): the pod's unicast IP of the same
// address family as the anycast IP.
via net.IP
}
// NewAnycastReconciler returns a Reconciler ready to Run.
//
// The arguments are stored as-is on the returned value. The advertised map
// starts empty and the trigger channel has capacity 1, so at most one
// reconcile request is ever pending.
//
// A nil logger is replaced with slog.Default(): reconcile logs
// unconditionally, and calling methods on a nil *slog.Logger panics.
func NewAnycastReconciler(node string, store *Store, pods *PodCache, nc *NodeConfigCache, bird *BirdManager, routerID string, logger *slog.Logger) *AnycastReconciler {
	if logger == nil {
		logger = slog.Default()
	}
	return &AnycastReconciler{
		Node:       node,
		Store:      store,
		Pods:       pods,
		NodeConfig: nc,
		Bird:       bird,
		RouterID:   routerID,
		Logger:     logger,
		advertised: map[string]anycastTarget{},
		trigger:    make(chan struct{}, 1),
	}
}
// Trigger asks for a reconcile pass without ever blocking the caller.
// The send is attempted non-blockingly: if the channel cannot accept the
// signal right now (a request is already queued), the call does nothing,
// so bursts of triggers collapse into a single pending pass.
func (r *AnycastReconciler) Trigger() {
	var wake struct{}
	select {
	case r.trigger <- wake:
		// Queued; Run will pick it up.
		return
	default:
		// A request is already pending; nothing to add.
		return
	}
}
// Run drives the reconcile loop and returns only once ctx is cancelled.
// It performs one pass immediately, then another on every 2-second tick
// and whenever a Trigger signal arrives.
func (r *AnycastReconciler) Run(ctx context.Context) {
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()

	// Initial pass before waiting on any event.
	r.reconcile()

	for {
		// Wait for the next wake-up; both the tick and an explicit trigger
		// fall through to the single reconcile call below.
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		case <-r.trigger:
		}
		r.reconcile()
	}
}
// reconcile performs one pass under r.mu:
//
//  1. installs (or replaces) the host route for every desired anycast IP
//     that is not yet recorded in r.advertised with the same target;
//  2. removes the host route for every recorded IP that is no longer
//     desired;
//  3. re-renders BIRD from the desired set.
//
// r.advertised only ever records routes whose install succeeded. An entry
// is dropped only after its route was removed successfully; when removal
// fails the entry is kept, so the next pass retries the delete instead of
// forgetting a route that is still in the kernel.
func (r *AnycastReconciler) reconcile() {
	r.mu.Lock()
	defer r.mu.Unlock()

	desired := r.computeDesired()

	// Install routes that should exist but don't (or whose target changed).
	for ip, t := range desired {
		if cur, ok := r.advertised[ip]; ok && cur.hostIface == t.hostIface && cur.via.Equal(t.via) {
			continue
		}
		if err := installAnycastRoute(ip, t); err != nil {
			r.Logger.Warn("anycast install", "ip", ip, "host", t.hostIface, "via", t.via, "err", err)
			continue
		}
		r.Logger.Info("anycast advertise", "ip", ip, "host", t.hostIface, "via", t.via)
		r.advertised[ip] = t
	}

	// Remove routes that exist but shouldn't. Deleting map entries while
	// ranging over the map is safe in Go.
	for ip, t := range r.advertised {
		if _, want := desired[ip]; want {
			continue
		}
		if err := removeAnycastRoute(ip, t); err != nil {
			// Keep the entry: the route may still be installed, so the
			// next pass must try the delete again.
			r.Logger.Warn("anycast remove", "ip", ip, "host", t.hostIface, "err", err)
			continue
		}
		r.Logger.Info("anycast withdraw", "ip", ip, "host", t.hostIface)
		delete(r.advertised, ip)
	}

	// Re-render BIRD with the active set.
	r.renderBird(desired)
}
// computeDesired builds the canonical-IP → anycastTarget map of every
// anycast advertisement that should be active right now.
//
// An allocation contributes only when it is in StateCommitted, lists at
// least one anycast IP, and its pod is known to the pod cache and Ready.
// Each anycast IP is paired with the pod's own unicast IP of the same
// address family as the route's `via` next-hop — kernel NDP/ARP then
// resolves an address that IS configured on the pod's eth0, so no
// proxy_ndp is needed. Unparseable anycast strings are skipped silently;
// an anycast IP with no same-family unicast IP is skipped with a warning.
//
// If the same anycast IP appears in several qualifying allocations, the
// one visited last in the snapshot overwrites the earlier entries.
func (r *AnycastReconciler) computeDesired() map[string]anycastTarget {
	targets := make(map[string]anycastTarget)
	for _, alloc := range r.Store.Snapshot() {
		if len(alloc.Anycast) == 0 || alloc.State != StateCommitted {
			continue
		}
		pod, found := r.Pods.Get(alloc.Namespace, alloc.PodName)
		if !found || !podReady(pod) {
			continue
		}

		hostIface := HostIfaceName(alloc.ContainerID)
		gw6, gw4 := net.ParseIP(alloc.IP6), net.ParseIP(alloc.IP4)

		for _, raw := range alloc.Anycast {
			anycastIP := net.ParseIP(raw)
			if anycastIP == nil {
				continue
			}
			// Pick the pod's unicast address of the matching family.
			nextHop := gw6
			if anycastIP.To4() != nil {
				nextHop = gw4
			}
			if nextHop == nil {
				r.Logger.Warn("anycast skipped: pod has no unicast IP of same family",
					"pod", alloc.Namespace+"/"+alloc.PodName, "anycast", raw)
				continue
			}
			targets[canonical(anycastIP)] = anycastTarget{hostIface: hostIface, via: nextHop}
		}
	}
	return targets
}
// renderBird hands the desired anycast IPs to BIRD, split by address
// family (IPv6 list first, then IPv4, plus the router ID). It does nothing
// when no node config is loaded or when r.Bird is nil. Keys of desired that
// do not parse as IPs are ignored. A Render failure is logged as a warning
// and otherwise ignored.
//
// Go randomizes map iteration order, so both lists are sorted before being
// passed on: the same desired set always yields the same Render arguments
// instead of a different ordering on every pass.
func (r *AnycastReconciler) renderBird(desired map[string]anycastTarget) {
	nc := r.NodeConfig.Load()
	if nc == nil || r.Bird == nil {
		return
	}

	var v6, v4 []string
	for ipStr := range desired {
		ip := net.ParseIP(ipStr)
		if ip == nil {
			continue
		}
		if ip4 := ip.To4(); ip4 != nil {
			v4 = append(v4, ip4.String())
			continue
		}
		v6 = append(v6, ip.To16().String())
	}
	slices.Sort(v6)
	slices.Sort(v4)

	if err := r.Bird.Render(nc, v6, v4, r.RouterID); err != nil {
		r.Logger.Warn("anycast bird render", "err", err)
	}
}
// installAnycastRoute programs the host route
// `<ipStr>/<128|32> via t.via dev t.hostIface`.
//
// It fails if ipStr does not parse or if t.hostIface cannot be looked up.
// The route is written with RouteReplace, so calling it again — or over a
// stale entry for the same destination — overwrites rather than errors.
func installAnycastRoute(ipStr string, t anycastTarget) error {
	dstIP := net.ParseIP(ipStr)
	if dstIP == nil {
		return fmt.Errorf("bad ip %q", ipStr)
	}

	veth, err := netlink.LinkByName(t.hostIface)
	if err != nil {
		return fmt.Errorf("lookup %s: %w", t.hostIface, err)
	}

	// Host route: /32 with the 4-byte form for IPv4, /128 otherwise.
	bits := 128
	if v4 := dstIP.To4(); v4 != nil {
		dstIP, bits = v4, 32
	}

	// Scope is left at its zero value (SCOPE_UNIVERSE). Per the original
	// author's note: the gateway sits on a different "logical" subnet than
	// the local host route but is reachable on this veth; Linux accepts
	// that as long as forwarding is enabled on the veth (set in
	// configureHostSide) and the pod's eth0 carries the via address (the
	// pod's allocated IP6/IP4).
	route := netlink.Route{
		LinkIndex: veth.Attrs().Index,
		Dst:       cidrFor(dstIP, bits),
		Gw:        t.via,
	}
	return netlink.RouteReplace(&route)
}
// removeAnycastRoute deletes the host route
// `<ipStr>/<128|32> via t.via dev t.hostIface`.
//
// Removal is best-effort about things that are already gone, because DEL
// paths can race with veth teardown: an unparseable ipStr, a failed
// interface lookup, an ESRCH from the kernel, or an error recognised by
// linkNotFound all count as success. Any other RouteDel error is returned.
func removeAnycastRoute(ipStr string, t anycastTarget) error {
	dstIP := net.ParseIP(ipStr)
	if dstIP == nil {
		return nil
	}

	veth, err := netlink.LinkByName(t.hostIface)
	if err != nil {
		// Interface lookup failed; treated as nothing left to delete.
		return nil
	}

	// Host route: /32 with the 4-byte form for IPv4, /128 otherwise.
	bits := 128
	if v4 := dstIP.To4(); v4 != nil {
		dstIP, bits = v4, 32
	}

	route := netlink.Route{
		LinkIndex: veth.Attrs().Index,
		Dst:       cidrFor(dstIP, bits),
		Gw:        t.via,
	}

	err = netlink.RouteDel(&route)
	switch {
	case err == nil:
		return nil
	case errors.Is(err, syscall.ESRCH), linkNotFound(err):
		// ESRCH ("no such process") is netlink-speak for "no such route";
		// treat as success.
		return nil
	default:
		return err
	}
}
// Blank reference to flockv1alpha1.GroupName so the import above stays used
// even when nothing else in this file references the package; Go rejects
// unused imports at compile time (the original note mentions minimal builds).
var _ = flockv1alpha1.GroupName