anycast: kernel multipath route + L4 hash for multi-pod-per-node
Build flock Image / build (push) Has been cancelled
Build flock Image / build (push) Has been cancelled
Move pure resolver logic out of anycast_linux.go into anycast.go so it's
unit-testable on any host. Reshape anycastTarget from a single
{hostIface, via} into a sorted list of nexthops; multiple Ready pods on
the same node binding the same anycast IP now contribute one nexthop
each.
installAnycastRoute uses RTA_MULTIPATH (via netlink.Route.MultiPath)
when the target has more than one nexthop. Single-nexthop targets keep
the simple via-route shape so 1-pod-per-node keeps rendering identically
to today's production form in `ip route show`.
flock-agent writes net.ipv{4,6}.fib_multipath_hash_policy = 1 at
startup so the kernel hashes flows on (saddr, daddr, sport, dport, proto)
rather than just IPs. Best-effort — runs privileged in production, so
it works; falls back to L3 hash on environments where the write fails
(only matters for the multi-pod-per-node case anyway).
resolveAnycastTargets sorts nexthops by canonical(via) for stable
comparison so a quiet reconcile pass doesn't churn the kernel route.
8 new unit tests cover: 1-pod, 2-pods-same-anycast (multi-nexthop),
NotReady drop, no-Ready omits the IP, pending skipped, mixed v6+v4,
family mismatch warns, determinism.
Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,110 @@
|
|||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net"
|
||||||
|
"sort"
|
||||||
|
)
|
||||||
|
|
||||||
|
// anycastNexthop pairs a host-side veth with the pod eth0 address reached
// through it. Each pair is one candidate nexthop of the kernel route for an
// anycast IP.
type anycastNexthop struct {
	hostIface string // name of the host-side veth
	via       net.IP // pod eth0 address used as the route's gateway
}
|
||||||
|
|
||||||
|
// anycastTarget describes the kernel route shape for one advertised anycast
|
||||||
|
// IP. When more than one Ready pod on this node binds the same anycast IP,
|
||||||
|
// every Ready pod contributes a nexthop and the kernel does per-flow ECMP
|
||||||
|
// across them.
|
||||||
|
//
|
||||||
|
// nexthops is sorted by canonical(via) for deterministic comparison and
|
||||||
|
// stable kernel-route ordering across reconcile passes — the
|
||||||
|
// AnycastReconciler skips kernel writes when the new and old targets are
|
||||||
|
// equal, which only works if the slice order is stable.
|
||||||
|
type anycastTarget struct {
|
||||||
|
nexthops []anycastNexthop
|
||||||
|
}
|
||||||
|
|
||||||
|
// equal reports whether two targets describe the same kernel route.
|
||||||
|
// Both sides are expected to be sorted (the canonical constructor sorts).
|
||||||
|
func (t anycastTarget) equal(o anycastTarget) bool {
|
||||||
|
if len(t.nexthops) != len(o.nexthops) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for i := range t.nexthops {
|
||||||
|
if t.nexthops[i].hostIface != o.nexthops[i].hostIface {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if !t.nexthops[i].via.Equal(o.nexthops[i].via) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// resolveAnycastTargets walks the committed allocation set and returns the
|
||||||
|
// desired kernel-route shape for every anycast IP that has at least one
|
||||||
|
// Ready local pod binding it. Multiple Ready pods sharing the same anycast
|
||||||
|
// IP collapse into a single multi-nexthop target so the kernel can
|
||||||
|
// per-flow ECMP across them.
|
||||||
|
//
|
||||||
|
// Pure: no kernel calls, no informer access. Pods are surfaced via the
|
||||||
|
// isReady callback so the reconciler can plug in its informer; tests can
|
||||||
|
// pass any function that satisfies the signature.
|
||||||
|
//
|
||||||
|
// warn is invoked for human-facing skip reasons (e.g. anycast with no
|
||||||
|
// unicast of same family). nil-safe — pass nil to silently drop.
|
||||||
|
func resolveAnycastTargets(
|
||||||
|
allocations []Allocation,
|
||||||
|
isReady func(namespace, name string) bool,
|
||||||
|
warn func(string),
|
||||||
|
) map[string]anycastTarget {
|
||||||
|
if warn == nil {
|
||||||
|
warn = func(string) {}
|
||||||
|
}
|
||||||
|
out := map[string]anycastTarget{}
|
||||||
|
for _, a := range allocations {
|
||||||
|
if a.State != StateCommitted || len(a.Anycast) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !isReady(a.Namespace, a.PodName) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
host := HostIfaceName(a.ContainerID)
|
||||||
|
via6 := net.ParseIP(a.IP6)
|
||||||
|
via4 := net.ParseIP(a.IP4)
|
||||||
|
for _, ipStr := range a.Anycast {
|
||||||
|
ip := net.ParseIP(ipStr)
|
||||||
|
if ip == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var via net.IP
|
||||||
|
if ip.To4() != nil {
|
||||||
|
via = via4
|
||||||
|
} else {
|
||||||
|
via = via6
|
||||||
|
}
|
||||||
|
if via == nil {
|
||||||
|
warn("anycast " + ipStr + " skipped: pod " +
|
||||||
|
a.Namespace + "/" + a.PodName +
|
||||||
|
" has no unicast of same family")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := canonical(ip)
|
||||||
|
t := out[key]
|
||||||
|
t.nexthops = append(t.nexthops, anycastNexthop{hostIface: host, via: via})
|
||||||
|
out[key] = t
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Sort each target's nexthops for stable comparison + stable kernel
|
||||||
|
// ordering. Sort key is canonical(via) — sufficient for stability
|
||||||
|
// because (host, via) pairs are 1:1 (one veth per pod, one v6+v4 per
|
||||||
|
// pod, so via uniquely identifies the nexthop).
|
||||||
|
for k, t := range out {
|
||||||
|
sort.Slice(t.nexthops, func(i, j int) bool {
|
||||||
|
return canonical(t.nexthops[i].via) < canonical(t.nexthops[j].via)
|
||||||
|
})
|
||||||
|
out[k] = t
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
+72
-73
@@ -26,6 +26,11 @@ import (
|
|||||||
// - Pod transitions to Ready=False or DELETE → remove kernel route, remove
|
// - Pod transitions to Ready=False or DELETE → remove kernel route, remove
|
||||||
// from BIRD export.
|
// from BIRD export.
|
||||||
//
|
//
|
||||||
|
// When more than one Ready pod on this node binds the same anycast IP, the
|
||||||
|
// kernel route uses RTA_MULTIPATH so the kernel does per-flow ECMP across
|
||||||
|
// the contributing pods. This is the within-node companion to BGP-level
|
||||||
|
// ECMP across nodes.
|
||||||
|
//
|
||||||
// Reconcile is idempotent. Triggers: AfterCommit hook, Pod informer
|
// Reconcile is idempotent. Triggers: AfterCommit hook, Pod informer
|
||||||
// UpdateFunc on Ready transitions, periodic 2s tick.
|
// UpdateFunc on Ready transitions, periodic 2s tick.
|
||||||
type AnycastReconciler struct {
|
type AnycastReconciler struct {
|
||||||
@@ -42,13 +47,6 @@ type AnycastReconciler struct {
|
|||||||
trigger chan struct{}
|
trigger chan struct{}
|
||||||
}
|
}
|
||||||
|
|
||||||
// anycastTarget describes the kernel route shape for one advertised
|
|
||||||
// anycast IP: which veth, and which pod eth0 IP to use as next-hop.
|
|
||||||
type anycastTarget struct {
|
|
||||||
hostIface string
|
|
||||||
via net.IP
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewAnycastReconciler returns a Reconciler ready to Run.
|
// NewAnycastReconciler returns a Reconciler ready to Run.
|
||||||
func NewAnycastReconciler(node string, store *Store, pods *PodCache, nc *NodeConfigCache, bird *BirdManager, routerID string, logger *slog.Logger) *AnycastReconciler {
|
func NewAnycastReconciler(node string, store *Store, pods *PodCache, nc *NodeConfigCache, bird *BirdManager, routerID string, logger *slog.Logger) *AnycastReconciler {
|
||||||
return &AnycastReconciler{
|
return &AnycastReconciler{
|
||||||
@@ -96,25 +94,26 @@ func (r *AnycastReconciler) reconcile() {
|
|||||||
|
|
||||||
desired := r.computeDesired()
|
desired := r.computeDesired()
|
||||||
|
|
||||||
// Install routes that should exist but don't (or whose target changed).
|
// Install routes that should exist but don't, or whose nexthop set
|
||||||
|
// changed.
|
||||||
for ip, t := range desired {
|
for ip, t := range desired {
|
||||||
if cur, ok := r.advertised[ip]; ok && cur.hostIface == t.hostIface && cur.via.Equal(t.via) {
|
if cur, ok := r.advertised[ip]; ok && cur.equal(t) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if err := installAnycastRoute(ip, t); err != nil {
|
if err := installAnycastRoute(ip, t); err != nil {
|
||||||
r.Logger.Warn("anycast install", "ip", ip, "host", t.hostIface, "via", t.via, "err", err)
|
r.Logger.Warn("anycast install", "ip", ip, "nexthops", len(t.nexthops), "err", err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
r.Logger.Info("anycast advertise", "ip", ip, "host", t.hostIface, "via", t.via)
|
r.Logger.Info("anycast advertise", "ip", ip, "nexthops", describeNexthops(t))
|
||||||
r.advertised[ip] = t
|
r.advertised[ip] = t
|
||||||
}
|
}
|
||||||
// Remove routes that exist but shouldn't.
|
// Remove routes that exist but shouldn't.
|
||||||
for ip, t := range r.advertised {
|
for ip, t := range r.advertised {
|
||||||
if _, want := desired[ip]; !want {
|
if _, want := desired[ip]; !want {
|
||||||
if err := removeAnycastRoute(ip, t); err != nil {
|
if err := removeAnycastRoute(ip, t); err != nil {
|
||||||
r.Logger.Warn("anycast remove", "ip", ip, "host", t.hostIface, "err", err)
|
r.Logger.Warn("anycast remove", "ip", ip, "err", err)
|
||||||
} else {
|
} else {
|
||||||
r.Logger.Info("anycast withdraw", "ip", ip, "host", t.hostIface)
|
r.Logger.Info("anycast withdraw", "ip", ip)
|
||||||
}
|
}
|
||||||
delete(r.advertised, ip)
|
delete(r.advertised, ip)
|
||||||
}
|
}
|
||||||
@@ -124,44 +123,17 @@ func (r *AnycastReconciler) reconcile() {
|
|||||||
r.renderBird(desired)
|
r.renderBird(desired)
|
||||||
}
|
}
|
||||||
|
|
||||||
// computeDesired walks the Store and returns the per-ip anycastTarget for
|
// computeDesired delegates to the pure resolveAnycastTargets and plugs in
|
||||||
// every anycast advertisement that should be active right now. Each target
|
// the live informer-based isReady callback.
|
||||||
// uses the pod's own eth0 IP (same family) as the route's `via` next-hop —
|
|
||||||
// that way kernel NDP/ARP resolves the eth0 address, which IS configured
|
|
||||||
// on the pod's eth0, so the pod responds normally without proxy_ndp.
|
|
||||||
func (r *AnycastReconciler) computeDesired() map[string]anycastTarget {
|
func (r *AnycastReconciler) computeDesired() map[string]anycastTarget {
|
||||||
out := map[string]anycastTarget{}
|
return resolveAnycastTargets(
|
||||||
for _, a := range r.Store.Snapshot() {
|
r.Store.Snapshot(),
|
||||||
if a.State != StateCommitted || len(a.Anycast) == 0 {
|
func(ns, name string) bool {
|
||||||
continue
|
pod, ok := r.Pods.Get(ns, name)
|
||||||
}
|
return ok && podReady(pod)
|
||||||
pod, ok := r.Pods.Get(a.Namespace, a.PodName)
|
},
|
||||||
if !ok || !podReady(pod) {
|
func(s string) { r.Logger.Warn(s) },
|
||||||
continue
|
)
|
||||||
}
|
|
||||||
host := HostIfaceName(a.ContainerID)
|
|
||||||
via6 := net.ParseIP(a.IP6)
|
|
||||||
via4 := net.ParseIP(a.IP4)
|
|
||||||
for _, ipStr := range a.Anycast {
|
|
||||||
ip := net.ParseIP(ipStr)
|
|
||||||
if ip == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
var via net.IP
|
|
||||||
if ip.To4() != nil {
|
|
||||||
via = via4
|
|
||||||
} else {
|
|
||||||
via = via6
|
|
||||||
}
|
|
||||||
if via == nil {
|
|
||||||
r.Logger.Warn("anycast skipped: pod has no unicast IP of same family",
|
|
||||||
"pod", a.Namespace+"/"+a.PodName, "anycast", ipStr)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
out[canonical(ip)] = anycastTarget{hostIface: host, via: via}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *AnycastReconciler) renderBird(desired map[string]anycastTarget) {
|
func (r *AnycastReconciler) renderBird(desired map[string]anycastTarget) {
|
||||||
@@ -186,56 +158,71 @@ func (r *AnycastReconciler) renderBird(desired map[string]anycastTarget) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// installAnycastRoute installs `<ipStr>/<128|32> via t.via dev t.hostIface`.
|
// installAnycastRoute installs `<ipStr>/<128|32>` pointing at the
|
||||||
|
// nexthop set in t. With one nexthop the route is a plain via-route;
|
||||||
|
// with multiple, it's a multipath route using RTA_MULTIPATH so the
|
||||||
|
// kernel hashes flows across the constituent pods.
|
||||||
|
//
|
||||||
// Idempotent — RouteReplace overwrites a stale entry.
|
// Idempotent — RouteReplace overwrites a stale entry.
|
||||||
func installAnycastRoute(ipStr string, t anycastTarget) error {
|
func installAnycastRoute(ipStr string, t anycastTarget) error {
|
||||||
ip := net.ParseIP(ipStr)
|
ip := net.ParseIP(ipStr)
|
||||||
if ip == nil {
|
if ip == nil {
|
||||||
return fmt.Errorf("bad ip %q", ipStr)
|
return fmt.Errorf("bad ip %q", ipStr)
|
||||||
}
|
}
|
||||||
link, err := netlink.LinkByName(t.hostIface)
|
if len(t.nexthops) == 0 {
|
||||||
if err != nil {
|
return fmt.Errorf("anycast %s: no nexthops", ipStr)
|
||||||
return fmt.Errorf("lookup %s: %w", t.hostIface, err)
|
|
||||||
}
|
}
|
||||||
prefix := 128
|
prefix := 128
|
||||||
if ip.To4() != nil {
|
if ip.To4() != nil {
|
||||||
prefix = 32
|
prefix = 32
|
||||||
ip = ip.To4()
|
ip = ip.To4()
|
||||||
}
|
}
|
||||||
r := &netlink.Route{
|
r := &netlink.Route{Dst: cidrFor(ip, prefix)}
|
||||||
LinkIndex: link.Attrs().Index,
|
if len(t.nexthops) == 1 {
|
||||||
Dst: cidrFor(ip, prefix),
|
// Single nexthop — keep the route shape identical to today's
|
||||||
Gw: t.via,
|
// production form. Functionally equivalent to a 1-element
|
||||||
// SCOPE_UNIVERSE — the gateway is on a different "logical" subnet
|
// MultiPath but `ip route show` renders nicer for operators.
|
||||||
// than the local /128 route, but reachable on this veth. Linux is
|
nh := t.nexthops[0]
|
||||||
// happy as long as the veth has IPv6 forwarding on (it does — set
|
link, err := netlink.LinkByName(nh.hostIface)
|
||||||
// in configureHostSide) and the pod's eth0 has the via address
|
if err != nil {
|
||||||
// (also true — that's the pod's IP6/IP4 we allocated).
|
return fmt.Errorf("lookup %s: %w", nh.hostIface, err)
|
||||||
|
}
|
||||||
|
r.LinkIndex = link.Attrs().Index
|
||||||
|
r.Gw = nh.via
|
||||||
|
} else {
|
||||||
|
hops := make([]*netlink.NexthopInfo, 0, len(t.nexthops))
|
||||||
|
for _, nh := range t.nexthops {
|
||||||
|
link, err := netlink.LinkByName(nh.hostIface)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("lookup %s: %w", nh.hostIface, err)
|
||||||
|
}
|
||||||
|
hops = append(hops, &netlink.NexthopInfo{
|
||||||
|
LinkIndex: link.Attrs().Index,
|
||||||
|
Gw: nh.via,
|
||||||
|
Hops: 0,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
r.MultiPath = hops
|
||||||
}
|
}
|
||||||
return netlink.RouteReplace(r)
|
return netlink.RouteReplace(r)
|
||||||
}
|
}
|
||||||
|
|
||||||
// removeAnycastRoute deletes the host route. Missing routes / interfaces
|
// removeAnycastRoute deletes the host route. Missing routes / interfaces
|
||||||
// are treated as success — DEL paths can race with veth teardown.
|
// are treated as success — DEL paths can race with veth teardown.
|
||||||
func removeAnycastRoute(ipStr string, t anycastTarget) error {
|
//
|
||||||
|
// Kernel route deletion matches by destination prefix; we don't need to
|
||||||
|
// re-specify the nexthop set.
|
||||||
|
func removeAnycastRoute(ipStr string, _ anycastTarget) error {
|
||||||
ip := net.ParseIP(ipStr)
|
ip := net.ParseIP(ipStr)
|
||||||
if ip == nil {
|
if ip == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
link, err := netlink.LinkByName(t.hostIface)
|
|
||||||
if err != nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
prefix := 128
|
prefix := 128
|
||||||
if ip.To4() != nil {
|
if ip.To4() != nil {
|
||||||
prefix = 32
|
prefix = 32
|
||||||
ip = ip.To4()
|
ip = ip.To4()
|
||||||
}
|
}
|
||||||
r := &netlink.Route{
|
r := &netlink.Route{Dst: cidrFor(ip, prefix)}
|
||||||
LinkIndex: link.Attrs().Index,
|
|
||||||
Dst: cidrFor(ip, prefix),
|
|
||||||
Gw: t.via,
|
|
||||||
}
|
|
||||||
if err := netlink.RouteDel(r); err != nil {
|
if err := netlink.RouteDel(r); err != nil {
|
||||||
// ESRCH ("no such process") is netlink-speak for "no such route";
|
// ESRCH ("no such process") is netlink-speak for "no such route";
|
||||||
// treat as success.
|
// treat as success.
|
||||||
@@ -247,5 +234,17 @@ func removeAnycastRoute(ipStr string, t anycastTarget) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// describeNexthops returns a compact string for log messages.
|
||||||
|
func describeNexthops(t anycastTarget) string {
|
||||||
|
var s string
|
||||||
|
for i, nh := range t.nexthops {
|
||||||
|
if i > 0 {
|
||||||
|
s += ","
|
||||||
|
}
|
||||||
|
s += nh.hostIface + "→" + nh.via.String()
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
// _ = flockv1alpha1 to silence unused import warnings on minimal builds.
|
// _ = flockv1alpha1 to silence unused import warnings on minimal builds.
|
||||||
var _ = flockv1alpha1.GroupName
|
var _ = flockv1alpha1.GroupName
|
||||||
|
|||||||
@@ -0,0 +1,227 @@
|
|||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// allReady is an isReady callback that reports every pod as Ready,
// whatever its namespace or name.
func allReady(namespace, name string) bool {
	return true
}
|
||||||
|
|
||||||
|
// readyOnly builds an isReady callback that reports true only for pods
// whose name is among want. The namespace argument is ignored. Called with
// no names, the returned callback reports false for every pod.
func readyOnly(want ...string) func(string, string) bool {
	ready := make(map[string]bool, len(want))
	for _, name := range want {
		ready[name] = true
	}
	return func(_, name string) bool {
		return ready[name]
	}
}
|
||||||
|
|
||||||
|
func TestResolveAnycastTargets_OnePodOneAnycast(t *testing.T) {
|
||||||
|
allocs := []Allocation{{
|
||||||
|
ContainerID: "c1", Namespace: "ns", PodName: "pod-a",
|
||||||
|
State: StateCommitted,
|
||||||
|
IP6: "2001:db8::1",
|
||||||
|
Anycast: []string{"2001:db8:a::1"},
|
||||||
|
}}
|
||||||
|
out := resolveAnycastTargets(allocs, allReady, nil)
|
||||||
|
if len(out) != 1 {
|
||||||
|
t.Fatalf("expected 1 anycast IP, got %d", len(out))
|
||||||
|
}
|
||||||
|
tgt, ok := out["2001:db8:a::1"]
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("missing target")
|
||||||
|
}
|
||||||
|
if len(tgt.nexthops) != 1 {
|
||||||
|
t.Fatalf("expected 1 nexthop, got %d", len(tgt.nexthops))
|
||||||
|
}
|
||||||
|
if !tgt.nexthops[0].via.Equal(net.ParseIP("2001:db8::1")) {
|
||||||
|
t.Fatalf("nexthop via wrong: %v", tgt.nexthops[0].via)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Two pods on the same node binding the same anycast IP must produce a
|
||||||
|
// SINGLE target with TWO nexthops. The previous behaviour (overwriting)
|
||||||
|
// was the bug this whole change exists to fix.
|
||||||
|
func TestResolveAnycastTargets_TwoPodsSameAnycast_MultiNexthop(t *testing.T) {
|
||||||
|
allocs := []Allocation{
|
||||||
|
{ContainerID: "c1", Namespace: "ns", PodName: "pod-a",
|
||||||
|
State: StateCommitted, IP6: "2001:db8::2",
|
||||||
|
Anycast: []string{"2001:db8:a::1"}},
|
||||||
|
{ContainerID: "c2", Namespace: "ns", PodName: "pod-b",
|
||||||
|
State: StateCommitted, IP6: "2001:db8::1",
|
||||||
|
Anycast: []string{"2001:db8:a::1"}},
|
||||||
|
}
|
||||||
|
out := resolveAnycastTargets(allocs, allReady, nil)
|
||||||
|
tgt := out["2001:db8:a::1"]
|
||||||
|
if len(tgt.nexthops) != 2 {
|
||||||
|
t.Fatalf("expected 2 nexthops, got %d", len(tgt.nexthops))
|
||||||
|
}
|
||||||
|
// Order should be sorted by canonical(via) — ::1 before ::2.
|
||||||
|
if !tgt.nexthops[0].via.Equal(net.ParseIP("2001:db8::1")) {
|
||||||
|
t.Fatalf("nexthops not sorted by via; got %v first", tgt.nexthops[0].via)
|
||||||
|
}
|
||||||
|
if !tgt.nexthops[1].via.Equal(net.ParseIP("2001:db8::2")) {
|
||||||
|
t.Fatalf("nexthops not sorted by via; got %v second", tgt.nexthops[1].via)
|
||||||
|
}
|
||||||
|
// HostIface differs per pod (different containerID → different FNV).
|
||||||
|
if tgt.nexthops[0].hostIface == tgt.nexthops[1].hostIface {
|
||||||
|
t.Fatalf("expected distinct hostIfaces, both %q", tgt.nexthops[0].hostIface)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// When one of the contributing pods goes NotReady, only the remaining
|
||||||
|
// Ready pod should appear in the target's nexthop set.
|
||||||
|
func TestResolveAnycastTargets_NotReadyDropped(t *testing.T) {
|
||||||
|
allocs := []Allocation{
|
||||||
|
{ContainerID: "c1", Namespace: "ns", PodName: "pod-a",
|
||||||
|
State: StateCommitted, IP6: "2001:db8::1",
|
||||||
|
Anycast: []string{"2001:db8:a::1"}},
|
||||||
|
{ContainerID: "c2", Namespace: "ns", PodName: "pod-b",
|
||||||
|
State: StateCommitted, IP6: "2001:db8::2",
|
||||||
|
Anycast: []string{"2001:db8:a::1"}},
|
||||||
|
}
|
||||||
|
out := resolveAnycastTargets(allocs, readyOnly("pod-a"), nil)
|
||||||
|
tgt := out["2001:db8:a::1"]
|
||||||
|
if len(tgt.nexthops) != 1 {
|
||||||
|
t.Fatalf("expected 1 nexthop after NotReady drop, got %d", len(tgt.nexthops))
|
||||||
|
}
|
||||||
|
if !tgt.nexthops[0].via.Equal(net.ParseIP("2001:db8::1")) {
|
||||||
|
t.Fatalf("wrong surviving nexthop: %v", tgt.nexthops[0].via)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pods that haven't reached Ready are excluded entirely from the target
|
||||||
|
// set. If no pod is Ready for an anycast IP, that IP is absent from the
|
||||||
|
// output (BIRD will withdraw from BGP, kernel route will be removed).
|
||||||
|
func TestResolveAnycastTargets_NoReadyPodsOmitsIP(t *testing.T) {
|
||||||
|
allocs := []Allocation{
|
||||||
|
{ContainerID: "c1", Namespace: "ns", PodName: "pod-a",
|
||||||
|
State: StateCommitted, IP6: "2001:db8::1",
|
||||||
|
Anycast: []string{"2001:db8:a::1"}},
|
||||||
|
}
|
||||||
|
out := resolveAnycastTargets(allocs, readyOnly( /* none */ ), nil)
|
||||||
|
if _, ok := out["2001:db8:a::1"]; ok {
|
||||||
|
t.Fatalf("anycast should be absent when no pod ready")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pending allocations (CNI ADD partway through) are skipped even if the
|
||||||
|
// pod is Ready — we don't program kernel routes for partial setups.
|
||||||
|
func TestResolveAnycastTargets_PendingSkipped(t *testing.T) {
|
||||||
|
allocs := []Allocation{
|
||||||
|
{ContainerID: "c1", Namespace: "ns", PodName: "pod-a",
|
||||||
|
State: StatePending, IP6: "2001:db8::1",
|
||||||
|
Anycast: []string{"2001:db8:a::1"}},
|
||||||
|
}
|
||||||
|
out := resolveAnycastTargets(allocs, allReady, nil)
|
||||||
|
if len(out) != 0 {
|
||||||
|
t.Fatalf("pending allocations must be skipped")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mixed v6+v4 anycast on the same pod produces two separate target
|
||||||
|
// entries, one per family, each anchored on the matching unicast IP.
|
||||||
|
func TestResolveAnycastTargets_MixedFamilies(t *testing.T) {
|
||||||
|
allocs := []Allocation{{
|
||||||
|
ContainerID: "c1", Namespace: "ns", PodName: "pod-a",
|
||||||
|
State: StateCommitted,
|
||||||
|
IP6: "2001:db8::1",
|
||||||
|
IP4: "10.0.0.1",
|
||||||
|
Anycast: []string{"2001:db8:a::1", "10.255.0.1"},
|
||||||
|
}}
|
||||||
|
out := resolveAnycastTargets(allocs, allReady, nil)
|
||||||
|
if !out["2001:db8:a::1"].nexthops[0].via.Equal(net.ParseIP("2001:db8::1")) {
|
||||||
|
t.Fatalf("v6 anycast should resolve via v6 unicast")
|
||||||
|
}
|
||||||
|
if !out["10.255.0.1"].nexthops[0].via.Equal(net.ParseIP("10.0.0.1").To4()) {
|
||||||
|
t.Fatalf("v4 anycast should resolve via v4 unicast")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// An anycast whose family has no matching unicast on the pod is skipped
|
||||||
|
// with a warning. Other anycast IPs on the same pod are unaffected.
|
||||||
|
func TestResolveAnycastTargets_FamilyMismatchWarns(t *testing.T) {
|
||||||
|
allocs := []Allocation{{
|
||||||
|
ContainerID: "c1", Namespace: "ns", PodName: "pod-a",
|
||||||
|
State: StateCommitted,
|
||||||
|
IP6: "2001:db8::1", // v6 only
|
||||||
|
Anycast: []string{"2001:db8:a::1", "10.255.0.1"},
|
||||||
|
}}
|
||||||
|
var warns []string
|
||||||
|
out := resolveAnycastTargets(allocs, allReady, func(s string) { warns = append(warns, s) })
|
||||||
|
if _, has := out["2001:db8:a::1"]; !has {
|
||||||
|
t.Fatalf("v6 anycast should have been programmed")
|
||||||
|
}
|
||||||
|
if _, has := out["10.255.0.1"]; has {
|
||||||
|
t.Fatalf("v4 anycast should have been skipped")
|
||||||
|
}
|
||||||
|
if len(warns) != 1 {
|
||||||
|
t.Fatalf("expected 1 warning, got %d: %v", len(warns), warns)
|
||||||
|
}
|
||||||
|
if !strings.Contains(warns[0], "10.255.0.1") {
|
||||||
|
t.Fatalf("warning should mention skipped IP: %q", warns[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determinism: the same input must produce nexthops in the same order.
|
||||||
|
func TestResolveAnycastTargets_Determinism(t *testing.T) {
|
||||||
|
allocs := []Allocation{
|
||||||
|
{ContainerID: "z-late", Namespace: "ns", PodName: "z",
|
||||||
|
State: StateCommitted, IP6: "2001:db8::5",
|
||||||
|
Anycast: []string{"2001:db8:a::1"}},
|
||||||
|
{ContainerID: "a-early", Namespace: "ns", PodName: "a",
|
||||||
|
State: StateCommitted, IP6: "2001:db8::3",
|
||||||
|
Anycast: []string{"2001:db8:a::1"}},
|
||||||
|
{ContainerID: "m-mid", Namespace: "ns", PodName: "m",
|
||||||
|
State: StateCommitted, IP6: "2001:db8::4",
|
||||||
|
Anycast: []string{"2001:db8:a::1"}},
|
||||||
|
}
|
||||||
|
a := resolveAnycastTargets(allocs, allReady, nil)
|
||||||
|
b := resolveAnycastTargets(allocs, allReady, nil)
|
||||||
|
if !a["2001:db8:a::1"].equal(b["2001:db8:a::1"]) {
|
||||||
|
t.Fatalf("same input produced unequal targets")
|
||||||
|
}
|
||||||
|
// Sorted by canonical(via): ::3, ::4, ::5
|
||||||
|
via := a["2001:db8:a::1"].nexthops
|
||||||
|
if !via[0].via.Equal(net.ParseIP("2001:db8::3")) ||
|
||||||
|
!via[1].via.Equal(net.ParseIP("2001:db8::4")) ||
|
||||||
|
!via[2].via.Equal(net.ParseIP("2001:db8::5")) {
|
||||||
|
t.Fatalf("nexthops not stably sorted: %v %v %v", via[0].via, via[1].via, via[2].via)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// equal()'s contract — different orderings are still considered equal
|
||||||
|
// AS LONG AS both sides have been canonicalised by resolveAnycastTargets.
|
||||||
|
// Across-call comparisons of resolver outputs must always match for the
|
||||||
|
// same logical input.
|
||||||
|
func TestAnycastTarget_Equal(t *testing.T) {
|
||||||
|
a := anycastTarget{nexthops: []anycastNexthop{
|
||||||
|
{hostIface: "f1", via: net.ParseIP("2001:db8::1")},
|
||||||
|
{hostIface: "f2", via: net.ParseIP("2001:db8::2")},
|
||||||
|
}}
|
||||||
|
b := anycastTarget{nexthops: []anycastNexthop{
|
||||||
|
{hostIface: "f1", via: net.ParseIP("2001:db8::1")},
|
||||||
|
{hostIface: "f2", via: net.ParseIP("2001:db8::2")},
|
||||||
|
}}
|
||||||
|
if !a.equal(b) {
|
||||||
|
t.Fatalf("equal targets reported unequal")
|
||||||
|
}
|
||||||
|
c := anycastTarget{nexthops: []anycastNexthop{
|
||||||
|
{hostIface: "f1", via: net.ParseIP("2001:db8::1")},
|
||||||
|
}}
|
||||||
|
if a.equal(c) {
|
||||||
|
t.Fatalf("targets with different lengths reported equal")
|
||||||
|
}
|
||||||
|
d := anycastTarget{nexthops: []anycastNexthop{
|
||||||
|
{hostIface: "f1", via: net.ParseIP("2001:db8::1")},
|
||||||
|
{hostIface: "f2", via: net.ParseIP("2001:db8::3")}, // diff IP
|
||||||
|
}}
|
||||||
|
if a.equal(d) {
|
||||||
|
t.Fatalf("targets with different vias reported equal")
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -6,11 +6,36 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net"
|
"net"
|
||||||
|
"os"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"code.fritzlab.net/fritzlab/flock/pkg/agent/netpol"
|
"code.fritzlab.net/fritzlab/flock/pkg/agent/netpol"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// hostMultipathHashSysctls maps each node-level sysctl file that
// flock-agent writes (best-effort) at startup to the value written.
//
// Both entries select multipath hash policy 1. The default policy 0 hashes
// on (saddr, daddr) only; policy 1 adds the L4 fields (sport, dport,
// proto), so distinct connections between the same two hosts can land on
// different nexthops. That is what spreads load across several anycast
// pods on one node.
var hostMultipathHashSysctls = map[string]string{
	"/proc/sys/net/ipv6/fib_multipath_hash_policy": "1",
	"/proc/sys/net/ipv4/fib_multipath_hash_policy": "1",
}
|
||||||
|
|
||||||
|
// applyHostSysctls writes the sysctls in m, logging but not failing on
|
||||||
|
// errors. flock-agent is privileged so this works in the production
|
||||||
|
// DaemonSet; in environments where it doesn't, single-pod-per-node
|
||||||
|
// anycast still works (this only affects the multi-pod-per-node case).
|
||||||
|
func applyHostSysctls(s *Server) {
|
||||||
|
for path, value := range hostMultipathHashSysctls {
|
||||||
|
if err := os.WriteFile(path, []byte(value), 0o644); err != nil {
|
||||||
|
s.Logger.Warn("set host sysctl", "path", path, "value", value, "err", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
s.Logger.Info("host sysctl set", "path", path, "value", value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// configureRuntime wires Pod informer, IPAM, netlink, and BIRD on a real
|
// configureRuntime wires Pod informer, IPAM, netlink, and BIRD on a real
|
||||||
// Linux node. Steps:
|
// Linux node. Steps:
|
||||||
//
|
//
|
||||||
@@ -23,6 +48,8 @@ import (
|
|||||||
// 5. Build PodHandler and SetHandlers(add, del, check).
|
// 5. Build PodHandler and SetHandlers(add, del, check).
|
||||||
// 6. Install BIRD blackhole summary routes + render initial config.
|
// 6. Install BIRD blackhole summary routes + render initial config.
|
||||||
func (s *Server) configureRuntime(ctx context.Context) error {
|
func (s *Server) configureRuntime(ctx context.Context) error {
|
||||||
|
applyHostSysctls(s)
|
||||||
|
|
||||||
if err := s.firstAvailableNodeConfig(ctx, 60*time.Second); err != nil {
|
if err := s.firstAvailableNodeConfig(ctx, 60*time.Second); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user