2082df37e5
Build flock Image / build (push) Has been cancelled
Reverts the eth0-placement hack from e1e9544. The design doc's lo
placement is correct.
Real fix: the host's anycast /128 (or /32) route now uses the pod's own
eth0 unicast IP (same family) as the route's `via` next-hop. The kernel
then does NDP/ARP for that eth0 IP — which IS configured on the pod's
eth0 — so the pod responds normally with no proxy_ndp / proxy_arp
trickery on the anycast IP itself.
ip -6 route add <anycast>/128 via <pod-eth0-v6> dev flock<8hex>
ip -4 route add <anycast>/32 via <pod-eth0-v4> dev flock<8hex>
Validation: an anycast IP whose family the pod doesn't have a unicast
for is skipped with a warn (an v4 anycast on an IPv6-only pod cannot be
NDP-resolved this way; require dual-stack).
Bonus cleanup: ESRCH from RouteDel is treated as success (idempotent).
Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
252 lines
6.8 KiB
Go
252 lines
6.8 KiB
Go
//go:build linux
|
||
|
||
package agent
|
||
|
||
import (
|
||
"context"
|
||
"errors"
|
||
"fmt"
|
||
"log/slog"
|
||
"net"
|
||
"sync"
|
||
"syscall"
|
||
"time"
|
||
|
||
flockv1alpha1 "code.fritzlab.net/fritzlab/flock/pkg/api/v1alpha1"
|
||
"github.com/vishvananda/netlink"
|
||
)
|
||
|
||
// AnycastReconciler keeps the kernel's anycast host routes and BIRD's
|
||
// advertised set in sync with (committed allocations × pod Ready).
|
||
//
|
||
// Lifecycle (per design doc):
|
||
// - CNI ADD assigns anycast IPs to pod lo (already done in netns_linux.go).
|
||
// - Pod transitions to Ready=True → install host /128 (or /32) route at
|
||
// `dev flock<8hex>` and add the IP to BIRD's export filter.
|
||
// - Pod transitions to Ready=False or DELETE → remove kernel route, remove
|
||
// from BIRD export.
|
||
//
|
||
// Reconcile is idempotent. Triggers: AfterCommit hook, Pod informer
|
||
// UpdateFunc on Ready transitions, periodic 2s tick.
|
||
type AnycastReconciler struct {
|
||
Node string
|
||
Store *Store
|
||
Pods *PodCache
|
||
NodeConfig *NodeConfigCache
|
||
Bird *BirdManager
|
||
RouterID string
|
||
Logger *slog.Logger
|
||
|
||
mu sync.Mutex
|
||
advertised map[string]anycastTarget // canonical IP → install info
|
||
trigger chan struct{}
|
||
}
|
||
|
||
// anycastTarget describes the kernel route shape for one advertised
|
||
// anycast IP: which veth, and which pod eth0 IP to use as next-hop.
|
||
type anycastTarget struct {
|
||
hostIface string
|
||
via net.IP
|
||
}
|
||
|
||
// NewAnycastReconciler returns a Reconciler ready to Run.
|
||
func NewAnycastReconciler(node string, store *Store, pods *PodCache, nc *NodeConfigCache, bird *BirdManager, routerID string, logger *slog.Logger) *AnycastReconciler {
|
||
return &AnycastReconciler{
|
||
Node: node,
|
||
Store: store,
|
||
Pods: pods,
|
||
NodeConfig: nc,
|
||
Bird: bird,
|
||
RouterID: routerID,
|
||
Logger: logger,
|
||
advertised: map[string]anycastTarget{},
|
||
trigger: make(chan struct{}, 1),
|
||
}
|
||
}
|
||
|
||
// Trigger requests one reconcile pass. Coalesces — if a pass is already
|
||
// pending, the call is a no-op.
|
||
func (r *AnycastReconciler) Trigger() {
|
||
select {
|
||
case r.trigger <- struct{}{}:
|
||
default:
|
||
}
|
||
}
|
||
|
||
// Run blocks until ctx is cancelled. Reconciles on Trigger or every 2s.
|
||
func (r *AnycastReconciler) Run(ctx context.Context) {
|
||
t := time.NewTicker(2 * time.Second)
|
||
defer t.Stop()
|
||
r.reconcile() // initial pass
|
||
for {
|
||
select {
|
||
case <-ctx.Done():
|
||
return
|
||
case <-t.C:
|
||
r.reconcile()
|
||
case <-r.trigger:
|
||
r.reconcile()
|
||
}
|
||
}
|
||
}
|
||
|
||
func (r *AnycastReconciler) reconcile() {
|
||
r.mu.Lock()
|
||
defer r.mu.Unlock()
|
||
|
||
desired := r.computeDesired()
|
||
|
||
// Install routes that should exist but don't (or whose target changed).
|
||
for ip, t := range desired {
|
||
if cur, ok := r.advertised[ip]; ok && cur.hostIface == t.hostIface && cur.via.Equal(t.via) {
|
||
continue
|
||
}
|
||
if err := installAnycastRoute(ip, t); err != nil {
|
||
r.Logger.Warn("anycast install", "ip", ip, "host", t.hostIface, "via", t.via, "err", err)
|
||
continue
|
||
}
|
||
r.Logger.Info("anycast advertise", "ip", ip, "host", t.hostIface, "via", t.via)
|
||
r.advertised[ip] = t
|
||
}
|
||
// Remove routes that exist but shouldn't.
|
||
for ip, t := range r.advertised {
|
||
if _, want := desired[ip]; !want {
|
||
if err := removeAnycastRoute(ip, t); err != nil {
|
||
r.Logger.Warn("anycast remove", "ip", ip, "host", t.hostIface, "err", err)
|
||
} else {
|
||
r.Logger.Info("anycast withdraw", "ip", ip, "host", t.hostIface)
|
||
}
|
||
delete(r.advertised, ip)
|
||
}
|
||
}
|
||
|
||
// Re-render BIRD with the active set.
|
||
r.renderBird(desired)
|
||
}
|
||
|
||
// computeDesired walks the Store and returns the per-ip anycastTarget for
|
||
// every anycast advertisement that should be active right now. Each target
|
||
// uses the pod's own eth0 IP (same family) as the route's `via` next-hop —
|
||
// that way kernel NDP/ARP resolves the eth0 address, which IS configured
|
||
// on the pod's eth0, so the pod responds normally without proxy_ndp.
|
||
func (r *AnycastReconciler) computeDesired() map[string]anycastTarget {
|
||
out := map[string]anycastTarget{}
|
||
for _, a := range r.Store.Snapshot() {
|
||
if a.State != StateCommitted || len(a.Anycast) == 0 {
|
||
continue
|
||
}
|
||
pod, ok := r.Pods.Get(a.Namespace, a.PodName)
|
||
if !ok || !podReady(pod) {
|
||
continue
|
||
}
|
||
host := HostIfaceName(a.ContainerID)
|
||
via6 := net.ParseIP(a.IP6)
|
||
via4 := net.ParseIP(a.IP4)
|
||
for _, ipStr := range a.Anycast {
|
||
ip := net.ParseIP(ipStr)
|
||
if ip == nil {
|
||
continue
|
||
}
|
||
var via net.IP
|
||
if ip.To4() != nil {
|
||
via = via4
|
||
} else {
|
||
via = via6
|
||
}
|
||
if via == nil {
|
||
r.Logger.Warn("anycast skipped: pod has no unicast IP of same family",
|
||
"pod", a.Namespace+"/"+a.PodName, "anycast", ipStr)
|
||
continue
|
||
}
|
||
out[canonical(ip)] = anycastTarget{hostIface: host, via: via}
|
||
}
|
||
}
|
||
return out
|
||
}
|
||
|
||
func (r *AnycastReconciler) renderBird(desired map[string]anycastTarget) {
|
||
nc := r.NodeConfig.Load()
|
||
if nc == nil || r.Bird == nil {
|
||
return
|
||
}
|
||
var v6, v4 []string
|
||
for ipStr := range desired {
|
||
ip := net.ParseIP(ipStr)
|
||
if ip == nil {
|
||
continue
|
||
}
|
||
if ip.To4() != nil {
|
||
v4 = append(v4, ip.To4().String())
|
||
} else {
|
||
v6 = append(v6, ip.To16().String())
|
||
}
|
||
}
|
||
if err := r.Bird.Render(nc, v6, v4, r.RouterID); err != nil {
|
||
r.Logger.Warn("anycast bird render", "err", err)
|
||
}
|
||
}
|
||
|
||
// installAnycastRoute installs `<ipStr>/<128|32> via t.via dev t.hostIface`.
|
||
// Idempotent — RouteReplace overwrites a stale entry.
|
||
func installAnycastRoute(ipStr string, t anycastTarget) error {
|
||
ip := net.ParseIP(ipStr)
|
||
if ip == nil {
|
||
return fmt.Errorf("bad ip %q", ipStr)
|
||
}
|
||
link, err := netlink.LinkByName(t.hostIface)
|
||
if err != nil {
|
||
return fmt.Errorf("lookup %s: %w", t.hostIface, err)
|
||
}
|
||
prefix := 128
|
||
if ip.To4() != nil {
|
||
prefix = 32
|
||
ip = ip.To4()
|
||
}
|
||
r := &netlink.Route{
|
||
LinkIndex: link.Attrs().Index,
|
||
Dst: cidrFor(ip, prefix),
|
||
Gw: t.via,
|
||
// SCOPE_UNIVERSE — the gateway is on a different "logical" subnet
|
||
// than the local /128 route, but reachable on this veth. Linux is
|
||
// happy as long as the veth has IPv6 forwarding on (it does — set
|
||
// in configureHostSide) and the pod's eth0 has the via address
|
||
// (also true — that's the pod's IP6/IP4 we allocated).
|
||
}
|
||
return netlink.RouteReplace(r)
|
||
}
|
||
|
||
// removeAnycastRoute deletes the host route. Missing routes / interfaces
|
||
// are treated as success — DEL paths can race with veth teardown.
|
||
func removeAnycastRoute(ipStr string, t anycastTarget) error {
|
||
ip := net.ParseIP(ipStr)
|
||
if ip == nil {
|
||
return nil
|
||
}
|
||
link, err := netlink.LinkByName(t.hostIface)
|
||
if err != nil {
|
||
return nil
|
||
}
|
||
prefix := 128
|
||
if ip.To4() != nil {
|
||
prefix = 32
|
||
ip = ip.To4()
|
||
}
|
||
r := &netlink.Route{
|
||
LinkIndex: link.Attrs().Index,
|
||
Dst: cidrFor(ip, prefix),
|
||
Gw: t.via,
|
||
}
|
||
if err := netlink.RouteDel(r); err != nil {
|
||
// ESRCH ("no such process") is netlink-speak for "no such route";
|
||
// treat as success.
|
||
if errors.Is(err, syscall.ESRCH) || linkNotFound(err) {
|
||
return nil
|
||
}
|
||
return err
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// _ = flockv1alpha1 to silence unused import warnings on minimal builds.
|
||
var _ = flockv1alpha1.GroupName
|