Files
flock/pkg/agent/anycast_linux.go
Donavan Fritz a17d33e182
Build flock Image / build (push) Successful in 5m27s
agent: addresses annotation replaces IPAM allocation
When flock.fritzlab.net/addresses provides a v6 or v4, the IP becomes
the pod's primary IP for that family — bound to eth0, default route off
it, on-link host route via setHostRoute, and a per-pod /128 or /32 in
BGP. IPAM no longer allocates a private IP alongside it. The pod ends up
with exactly the operator-supplied addresses on eth0 (plus any extras
beyond the first-of-family, which keep the pre-existing layered
behavior).

This is the fix the original addresses-annotation work missed: bug #1
allocated a private IP next to the public one (so VPN-routed clients
could land on the private path on Plex). Promoting addresses-supplied
IPs into the IPAM-style routing slot keeps the public IP as the only
primary IP visible from outside.

Three pieces:
- annotations.go: reject pods whose addresses/anycast IP family is
  disabled (ipv6/ipv4 annotation or NodeConfig default). Both annotation
  types rely on the family being enabled for return-path routing.
- handlers.go: peel first v6 + first v4 from Addresses into res.IP6/IP4;
  suppress IPAM for those families; skip IPAM call entirely if both
  families are addresses-supplied.
- anycast_linux.go: extend renderBird to advertise any IPAM IP that's
  outside the node's BGP aggregate as a per-pod /32 or /128. This is
  what makes 142.202.202.166 reachable when host004's pod CIDR is
  172.25.214.0/24 — the addresses-promoted IP isn't covered by the
  aggregate.

Tests: 7 new annotation tests covering the conflict cases (ipv4=false +
addresses-v4, NodeConfig default + addresses-v4, etc.) plus 5 unit tests
for the splitAddressesPrimary helper.

README updated with the addresses-replaces-IPAM behavior, the
addresses-vs-anycast comparison, the conflict rule, and a Plex-style
example.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 09:46:48 -05:00

303 lines
8.2 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//go:build linux
package agent
import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"net"
	"slices"
	"sync"
	"syscall"
	"time"

	flockv1alpha1 "code.fritzlab.net/fritzlab/flock/pkg/api/v1alpha1"
	"github.com/vishvananda/netlink"
)
// AnycastReconciler keeps the kernel's anycast host routes and BIRD's
// advertised set in sync with (committed allocations × pod Ready).
//
// Lifecycle (per design doc):
// - CNI ADD assigns anycast IPs to pod lo (already done in netns_linux.go).
// - Pod transitions to Ready=True → install host /128 (or /32) route at
// `dev flock<8hex>` and add the IP to BIRD's export filter.
// - Pod transitions to Ready=False or DELETE → remove kernel route, remove
// from BIRD export.
//
// When more than one Ready pod on this node binds the same anycast IP, the
// kernel route uses RTA_MULTIPATH so the kernel does per-flow ECMP across
// the contributing pods. This is the within-node companion to BGP-level
// ECMP across nodes.
//
// Reconcile is idempotent. Triggers: AfterCommit hook, Pod informer
// UpdateFunc on Ready transitions, periodic 2s tick.
type AnycastReconciler struct {
// Node is the value passed as `node` to NewAnycastReconciler.
// NOTE(review): presumably this node's name; it is not read anywhere in
// this file — confirm against other files in the package.
Node string
// Store supplies the allocation snapshot (Store.Snapshot) that both
// computeDesired and renderBird work from.
Store *Store
// Pods is queried by namespace/name in computeDesired to decide whether a
// pod is currently eligible for anycast.
Pods *PodCache
// NodeConfig yields the current NodeConfig via Load; renderBird does
// nothing while Load returns nil.
NodeConfig *NodeConfigCache
// Bird renders the BIRD configuration. May be nil, in which case
// renderBird is a no-op.
Bird *BirdManager
// RouterID is passed through unchanged to Bird.Render.
RouterID string
// Logger receives the Info/Warn records emitted during reconciliation.
Logger *slog.Logger
// mu is held for the whole duration of a reconcile pass.
mu sync.Mutex
advertised map[string]anycastTarget // canonical IP → install info
// trigger carries reconcile requests; it is created with capacity 1 so
// Trigger can coalesce requests without blocking.
trigger chan struct{}
}
// NewAnycastReconciler builds an AnycastReconciler from its collaborators.
// The returned value has an empty advertised set and a one-slot trigger
// channel, so it can be handed straight to Run.
func NewAnycastReconciler(node string, store *Store, pods *PodCache, nc *NodeConfigCache, bird *BirdManager, routerID string, logger *slog.Logger) *AnycastReconciler {
	rec := new(AnycastReconciler)

	// Caller-supplied collaborators and identity.
	rec.Node = node
	rec.Store = store
	rec.Pods = pods
	rec.NodeConfig = nc
	rec.Bird = bird
	rec.RouterID = routerID
	rec.Logger = logger

	// Internal state: nothing installed yet; the single buffer slot lets
	// Trigger record one pending request without blocking.
	rec.advertised = make(map[string]anycastTarget)
	rec.trigger = make(chan struct{}, 1)
	return rec
}
// Trigger asks for a reconcile pass without ever blocking the caller.
// The request is dropped when one is already queued on the trigger
// channel, so bursts of calls collapse into a single pending pass.
func (r *AnycastReconciler) Trigger() {
	var request struct{}
	select {
	default:
		// The channel cannot accept another request right now; the one
		// already queued covers this call as well.
	case r.trigger <- request:
	}
}
// Run drives the reconcile loop and returns only once ctx is cancelled.
// It reconciles once up front, then again on every Trigger request and on
// a fixed 2-second tick.
func (r *AnycastReconciler) Run(ctx context.Context) {
	ticker := time.NewTicker(2 * time.Second)
	defer ticker.Stop()

	r.reconcile() // initial pass
	for {
		// Block until there is a reason to reconcile (or to stop).
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		case <-r.trigger:
		}
		r.reconcile()
	}
}
// reconcile performs one idempotent pass: it installs or updates kernel
// routes for every desired anycast IP, withdraws routes for IPs that are
// no longer desired, and then re-renders BIRD with the desired set.
//
// r.advertised only ever records routes this reconciler believes are
// present in the kernel. An install that fails is not recorded and a
// removal that fails is not forgotten, so both are retried on the next
// pass (Trigger or the periodic tick in Run).
func (r *AnycastReconciler) reconcile() {
	r.mu.Lock()
	defer r.mu.Unlock()

	desired := r.computeDesired()

	// Install routes that should exist but don't, or whose nexthop set
	// changed.
	for ip, t := range desired {
		if cur, ok := r.advertised[ip]; ok && cur.equal(t) {
			continue
		}
		if err := installAnycastRoute(ip, t); err != nil {
			r.Logger.Warn("anycast install", "ip", ip, "nexthops", len(t.nexthops), "err", err)
			continue
		}
		r.Logger.Info("anycast advertise", "ip", ip, "nexthops", describeNexthops(t))
		r.advertised[ip] = t
	}

	// Remove routes that exist but shouldn't.
	for ip, t := range r.advertised {
		if _, want := desired[ip]; want {
			continue
		}
		if err := removeAnycastRoute(ip, t); err != nil {
			// Keep the entry: dropping it here would leave the stale
			// kernel route in place with nothing left to retry the
			// delete. removeAnycastRoute already reports "route/link
			// gone" as success, so this is a genuine failure.
			r.Logger.Warn("anycast remove", "ip", ip, "err", err)
			continue
		}
		r.Logger.Info("anycast withdraw", "ip", ip)
		delete(r.advertised, ip)
	}

	// Re-render BIRD with the active set.
	r.renderBird(desired)
}
// computeDesired returns the anycast IP → target map that should be
// installed right now. The actual resolution lives in
// resolveAnycastTargets; this method only wires in the live inputs: the
// current Store snapshot, a readiness check backed by the pod cache, and a
// warning sink backed by the reconciler's logger.
func (r *AnycastReconciler) computeDesired() map[string]anycastTarget {
	// A pod counts only if the cache knows it and it passes the anycast
	// eligibility check.
	eligible := func(ns, name string) bool {
		if pod, found := r.Pods.Get(ns, name); found {
			return podAnycastEligible(pod)
		}
		return false
	}
	warn := func(msg string) {
		r.Logger.Warn(msg)
	}
	return resolveAnycastTargets(r.Store.Snapshot(), eligible, warn)
}
// renderBird collects every host address this node must advertise as an
// individual /32 or /128 and hands the result to Bird.Render, split by
// address family.
//
// The set is the union of:
//   - every key of desired (the anycast IPs resolved for this pass), and
//   - every committed allocation's IP6/IP4 that falls outside the node's
//     configured CIDRs.
//
// Duplicates are dropped by canonical form. Both lists are sorted before
// rendering: they are assembled from map iteration, whose order changes
// from call to call, and this method runs on every reconcile pass, so an
// unsorted list would present the same set to BIRD in a different order
// each time.
//
// It is a no-op while no NodeConfig is loaded or when no BirdManager is
// configured. Render errors are logged, not returned.
func (r *AnycastReconciler) renderBird(desired map[string]anycastTarget) {
	nc := r.NodeConfig.Load()
	if nc == nil || r.Bird == nil {
		return
	}

	var v6, v4 []string
	seen := make(map[string]struct{}, len(desired))
	add := func(ip net.IP) {
		key := canonical(ip)
		if _, dup := seen[key]; dup {
			return
		}
		seen[key] = struct{}{}
		if ip4 := ip.To4(); ip4 != nil {
			v4 = append(v4, ip4.String())
			return
		}
		v6 = append(v6, ip.To16().String())
	}

	for ipStr := range desired {
		if ip := net.ParseIP(ipStr); ip != nil {
			add(ip)
		}
	}

	// A pod IP that lives outside the node's BGP aggregate (e.g. an
	// addresses-annotation IP promoted to be the pod's primary v4 — Plex's
	// 142.202.202.166 against host004's 172.25.214.0/24) is not naturally
	// covered by the aggregate, so it must be advertised individually as a
	// /32 or /128. Anycast and addresses extras are already covered by the
	// `desired` loop above; this sweep is for promoted-primary IPs which do
	// not flow through the AnycastReconciler.
	nodeV6, nodeV4 := parseNodeCIDRs(nc)
	for _, a := range r.Store.Snapshot() {
		if a.State != StateCommitted {
			continue
		}
		if ip := net.ParseIP(a.IP6); ip != nil && !ipInAny(ip, nodeV6) {
			add(ip)
		}
		if ip := net.ParseIP(a.IP4); ip != nil && !ipInAny(ip, nodeV4) {
			add(ip)
		}
	}

	// Stable order so an unchanged set always renders identically.
	slices.Sort(v6)
	slices.Sort(v4)

	if err := r.Bird.Render(nc, v6, v4, r.RouterID); err != nil {
		r.Logger.Warn("anycast bird render", "err", err)
	}
}
// parseNodeCIDRs converts the CIDR strings in nc.Spec.CIDR6 and
// nc.Spec.CIDR4 into *net.IPNet values, returned as (v6, v4).
//
// Entries that net.ParseCIDR rejects are skipped without any error or log
// (admission-time validation should have rejected them long before this
// point). A family with no parsable entries yields a nil slice.
func parseNodeCIDRs(nc *flockv1alpha1.NodeConfig) (v6, v4 []*net.IPNet) {
	parseAll := func(cidrs []string) []*net.IPNet {
		var parsed []*net.IPNet
		for _, raw := range cidrs {
			_, ipNet, err := net.ParseCIDR(raw)
			if err != nil {
				continue // malformed entry: drop it
			}
			parsed = append(parsed, ipNet)
		}
		return parsed
	}
	return parseAll(nc.Spec.CIDR6), parseAll(nc.Spec.CIDR4)
}
// ipInAny reports whether ip is contained in at least one of nets.
// An empty or nil nets always yields false.
func ipInAny(ip net.IP, nets []*net.IPNet) bool {
	for i := 0; i < len(nets); i++ {
		if !nets[i].Contains(ip) {
			continue
		}
		return true
	}
	return false
}
// installAnycastRoute programs the kernel host route for ipStr (a /32 for
// IPv4, a /128 otherwise) towards the nexthops listed in t.
//
// A single nexthop produces a plain gateway route on that nexthop's host
// interface; two or more produce a multipath route so the kernel spreads
// flows across the contributing pods.
//
// It returns an error when ipStr does not parse, when t carries no
// nexthops, when a nexthop's host interface cannot be looked up, or when
// the netlink call itself fails. The route is written with RouteReplace,
// so calling it again for the same prefix overwrites the previous entry.
func installAnycastRoute(ipStr string, t anycastTarget) error {
	dst := net.ParseIP(ipStr)
	if dst == nil {
		return fmt.Errorf("bad ip %q", ipStr)
	}
	if len(t.nexthops) == 0 {
		return fmt.Errorf("anycast %s: no nexthops", ipStr)
	}

	bits := 128
	if v4 := dst.To4(); v4 != nil {
		dst, bits = v4, 32
	}
	route := &netlink.Route{Dst: cidrFor(dst, bits)}

	if len(t.nexthops) > 1 {
		paths := make([]*netlink.NexthopInfo, 0, len(t.nexthops))
		for _, hop := range t.nexthops {
			dev, err := netlink.LinkByName(hop.hostIface)
			if err != nil {
				return fmt.Errorf("lookup %s: %w", hop.hostIface, err)
			}
			paths = append(paths, &netlink.NexthopInfo{
				LinkIndex: dev.Attrs().Index,
				Gw:        hop.via,
			})
		}
		route.MultiPath = paths
		return netlink.RouteReplace(route)
	}

	// Exactly one nexthop: emit an ordinary via-route rather than a
	// one-element multipath. The two are functionally equivalent, but the
	// plain form is the established production shape and reads better in
	// `ip route show`.
	only := t.nexthops[0]
	dev, err := netlink.LinkByName(only.hostIface)
	if err != nil {
		return fmt.Errorf("lookup %s: %w", only.hostIface, err)
	}
	route.LinkIndex = dev.Attrs().Index
	route.Gw = only.via
	return netlink.RouteReplace(route)
}
// removeAnycastRoute deletes the kernel host route for ipStr (a /32 for
// IPv4, a /128 otherwise).
//
// The delete is keyed on the destination prefix alone, which is why the
// anycastTarget argument is ignored. An unparsable ipStr, a route that is
// already gone, and a missing link all count as success, because DEL
// paths can race with veth teardown. Any other netlink error is returned.
func removeAnycastRoute(ipStr string, _ anycastTarget) error {
	dst := net.ParseIP(ipStr)
	if dst == nil {
		return nil
	}

	bits := 128
	if v4 := dst.To4(); v4 != nil {
		dst, bits = v4, 32
	}

	err := netlink.RouteDel(&netlink.Route{Dst: cidrFor(dst, bits)})
	switch {
	case err == nil:
		return nil
	case errors.Is(err, syscall.ESRCH), linkNotFound(err):
		// ESRCH ("no such process") is how netlink reports "no such
		// route"; together with a vanished link it means there is
		// nothing left to delete.
		return nil
	default:
		return err
	}
}
// describeNexthops formats t's nexthops for log output as a
// comma-separated list of "<hostIface>→<via>" pairs, in slice order.
// A target without nexthops yields the empty string.
func describeNexthops(t anycastTarget) string {
	var buf []byte
	for idx := range t.nexthops {
		if idx != 0 {
			buf = append(buf, ',')
		}
		hop := t.nexthops[idx]
		buf = append(buf, hop.hostIface...)
		buf = append(buf, "→"...)
		buf = append(buf, hop.via.String()...)
	}
	return string(buf)
}
// Blank reference that keeps the flockv1alpha1 import in use on minimal
// builds.
// NOTE(review): parseNodeCIDRs in this file already references
// flockv1alpha1.NodeConfig in its signature, so this guard looks
// redundant here — confirm before removing.
var _ = flockv1alpha1.GroupName