From a17d33e182cc809a1731ab91c170559e060cb8d3 Mon Sep 17 00:00:00 2001 From: Donavan Fritz Date: Wed, 29 Apr 2026 09:46:48 -0500 Subject: [PATCH] agent: addresses annotation replaces IPAM allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When flock.fritzlab.net/addresses provides a v6 or v4, the IP becomes the pod's primary IP for that family — bound to eth0, default route off it, on-link host route via setHostRoute, and a per-pod /128 or /32 in BGP. IPAM no longer allocates a private IP alongside it. The pod ends up with exactly the operator-supplied addresses on eth0 (plus any extras beyond the first-of-family, which keep the pre-existing layered behavior). This is the fix the original addresses-annotation work missed: bug #1 allocated a private IP next to the public one (so VPN-routed clients could land on the private path on Plex). Promoting addresses-supplied IPs into the IPAM-style routing slot keeps the public IP as the only primary IP visible from outside. Three pieces: - annotations.go: reject pods whose addresses/anycast IP family is disabled (ipv6/ipv4 annotation or NodeConfig default). Both annotation types rely on the family being enabled for return-path routing. - handlers.go: peel first v6 + first v4 from Addresses into res.IP6/IP4; suppress IPAM for those families; skip IPAM call entirely if both families are addresses-supplied. - anycast_linux.go: extend renderBird to advertise any IPAM IP that's outside the node's BGP aggregate as a per-pod /32 or /128. This is what makes 142.202.202.166 reachable when host004's pod CIDR is 172.25.214.0/24 — the addresses-promoted IP isn't covered by the aggregate. Tests: 7 new annotation tests covering the conflict cases (ipv4=false + addresses-v4, NodeConfig default + addresses-v4, etc.) plus 5 unit tests for the splitAddressesPrimary helper. README updated with the addresses-replaces-IPAM behavior, the addresses-vs-anycast comparison, the conflict rule, and a Plex-style example. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 59 +++++++++++++++++++++++ pkg/agent/annotations.go | 35 ++++++++++++++ pkg/agent/annotations_test.go | 91 +++++++++++++++++++++++++++++++++++ pkg/agent/anycast_linux.go | 60 +++++++++++++++++++++-- pkg/agent/handlers.go | 64 +++++++++++++++++++++--- pkg/agent/handlers_test.go | 78 ++++++++++++++++++++++++++++++ 6 files changed, 376 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index b1d808f..82b8fd0 100644 --- a/README.md +++ b/README.md @@ -176,12 +176,48 @@ optional; leave them off to inherit the per-node defaults. | `flock.fritzlab.net/cidr4` | CIDRs | Restrict IPv4 allocation to a sub-range of the node's `cidr4`. Comma-separated. | | `flock.fritzlab.net/ip-algo` | list | Embed identity into the IPv6 IID. Subset of `namespace,pod,image`, in order, comma-separated. | | `flock.fritzlab.net/anycast` | IPs | Bind these IPs on the pod's `lo`; advertise via BGP while pod is `Ready`. Mixed v6+v4 ok. | +| `flock.fritzlab.net/addresses` | IPs | Bind these IPs on the pod's `eth0`. The first v6 and first v4 **replace** IPAM allocation for that family — the addresses IP becomes the pod's primary IP. Mixed v6+v4 ok. Single-replica only in practice. | Bool values must be the literal strings `"true"` or `"false"` (case-insensitive, surrounding whitespace tolerated). Other values — `1`, `0`, `yes`, `no` — are rejected so a typo can't silently flip behaviour. +### `addresses` vs `anycast` + +Both annotations bind operator-supplied IPs onto a pod and have flock +advertise `/128` (or `/32`) per-pod over BGP. The differences are +where the IP lands and what it's for: + +| | `anycast` | `addresses` | +|----------------------------|----------------------------------------------------|-------------------------------------------------------------------| +| Bound on | pod `lo` | pod `eth0` | +| Multi-replica? | yes — every Ready replica advertises the same IP and the upstream router ECMPs across them | no — the same IP on multiple replicas is operator error | +| Replaces IPAM? | no — pod still has an IPAM-allocated unicast IP | **yes** — the first v6 + first v4 in the list become the pod's primary IPs in place of an IPAM allocation | +| Workload visibility | only the IPAM IP is on the primary interface | the public IP is `eth0`'s primary address — workloads that read their own NIC see it (e.g. Plex's remote-access detection) | + +Use `anycast` for shared services with many replicas (DNS, ingress). +Use `addresses` when one specific pod needs a known public IP that the +workload itself must see on its primary interface. + +### Conflict detection + +`addresses` and `anycast` reject pods that supply an IP whose family is +disabled. If the resolved `WantV4` is false (via the pod's `ipv4` +annotation or the NodeConfig default) and any addresses- or +anycast-supplied IP is IPv4, the CNI ADD fails with an explicit error. +Same for v6. Both annotation types put IPs on a pod interface and rely +on the family being enabled for return-path routing — silently accepting +the IP would leave a non-functional pod. + +### Outside-aggregate advertisement + +When an `addresses` IP replaces IPAM (becomes the pod's primary IP) the +IP is typically **outside** the node's BGP aggregate (e.g. a public +`/32` on a node whose pod CIDR is private). flock notices this during +BGP rendering and advertises the IP individually as a per-pod `/32` or +`/128` so the upstream router has a route to it. + ### Example pods Default dual-stack — no annotations needed: @@ -239,6 +275,29 @@ spec: failureThreshold: 1 ``` +Workload with a known public IP — single-replica pod whose application +inspects its own primary interface (Plex's remote-access flow). The +addresses become the pod's primary IPs in place of any IPAM allocation; +the pod's `eth0` ends up with exactly the supplied addresses, and BGP +advertises them as a `/128` and `/32`: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: plex +spec: + replicas: 1 + template: + metadata: + annotations: + flock.fritzlab.net/addresses: "2001:db8:c606::166, 192.0.2.166" + spec: + containers: + - name: plex + image: plexinc/pms-docker +``` + ## Use cases **Highly-available DNS.** Run N CoreDNS replicas, each annotated with diff --git a/pkg/agent/annotations.go b/pkg/agent/annotations.go index 344ab34..d3e7081 100644 --- a/pkg/agent/annotations.go +++ b/pkg/agent/annotations.go @@ -166,9 +166,44 @@ func ParseAnnotations(in map[string]string, defaults FamilyDefaults) (*ParsedAnn out.Addresses = ips } + // Reject pods that ask for an addresses- or anycast-supplied IP whose + // family was disabled (via the pod's ipv6/ipv4 annotation or NodeConfig + // default). Both annotation types put the IP on a pod interface and rely + // on the family being enabled for return-path routing — addresses needs + // the in-pod default v6/v4 route to send replies; anycast on lo needs + // the same default route on eth0 for the same reason. Silently accepting + // the IP would leave a non-functional pod, so we fail closed at ADD. + for _, ip := range out.Addresses { + if err := requireFamilyEnabled(ip, out.WantV6, out.WantV4, annAddresses); err != nil { + return nil, err + } + } + for _, ip := range out.Anycast { + if err := requireFamilyEnabled(ip, out.WantV6, out.WantV4, annAnycast); err != nil { + return nil, err + } + } + return out, nil } +// requireFamilyEnabled returns an error when ip's family was opted out via +// the resolved WantV6/WantV4 booleans (pod annotation > NodeConfig default > +// built-in dual-stack). The source string identifies which annotation +// supplied the conflicting IP so the operator's error message is specific. +func requireFamilyEnabled(ip net.IP, wantV6, wantV4 bool, source string) error { + if ip.To4() != nil { + if !wantV4 { + return fmt.Errorf("annotation %s: contains IPv4 %s but ipv4 is disabled (annotation or NodeConfig default)", source, ip) + } + return nil + } + if !wantV6 { + return fmt.Errorf("annotation %s: contains IPv6 %s but ipv6 is disabled (annotation or NodeConfig default)", source, ip) + } + return nil +} + // parseBoolAnnotation accepts only "true" or "false" (case-insensitive, // surrounding whitespace tolerated). All other values — including "1", "0", // "yes", "no" — are rejected so operator typos are caught loudly rather diff --git a/pkg/agent/annotations_test.go b/pkg/agent/annotations_test.go index edf87d6..46824b6 100644 --- a/pkg/agent/annotations_test.go +++ b/pkg/agent/annotations_test.go @@ -313,6 +313,97 @@ func TestParseAnnotations_Anycast_Mixed(t *testing.T) { } } +func TestParseAnnotations_Addresses_Mixed(t *testing.T) { + // Plex's case: one v6 and one v4 supplied via addresses, both families + // enabled (built-in defaults). Both IPs are recorded; conflict check + // passes; later in handlers.Add they get peeled into primary slots. + a, err := ParseAnnotations(map[string]string{ + annotationPrefix + "addresses": "2602:817:3000:c606::166, 142.202.202.166", + }, BuiltinFamilyDefaults()) + if err != nil { + t.Fatal(err) + } + if len(a.Addresses) != 2 { + t.Fatalf("addresses len=%d", len(a.Addresses)) + } +} + +func TestParseAnnotations_Addresses_ConflictV4Disabled(t *testing.T) { + // addresses contains a v4 but the pod has explicitly opted out of v4. + // The IP would land on eth0 with no default v4 route, so reject at ADD. + _, err := ParseAnnotations(map[string]string{ + annotationPrefix + "ipv4": "false", + annotationPrefix + "addresses": "142.202.202.166", + }, BuiltinFamilyDefaults()) + if err == nil { + t.Fatal("want error for ipv4=false + addresses v4, got nil") + } +} + +func TestParseAnnotations_Addresses_ConflictV6Disabled(t *testing.T) { + _, err := ParseAnnotations(map[string]string{ + annotationPrefix + "ipv6": "false", + annotationPrefix + "ipv4": "true", + annotationPrefix + "addresses": "2602:817:3000:c606::166", + }, BuiltinFamilyDefaults()) + if err == nil { + t.Fatal("want error for ipv6=false + addresses v6, got nil") + } +} + +func TestParseAnnotations_Anycast_ConflictV4Disabled(t *testing.T) { + // Anycast on lo also requires the family enabled — replies need the + // in-pod default v4 route off eth0, which only exists when v4 is on. + _, err := ParseAnnotations(map[string]string{ + annotationPrefix + "ipv4": "false", + annotationPrefix + "anycast": "172.25.255.1", + }, BuiltinFamilyDefaults()) + if err == nil { + t.Fatal("want error for ipv4=false + anycast v4, got nil") + } +} + +func TestParseAnnotations_Anycast_ConflictV6Disabled(t *testing.T) { + _, err := ParseAnnotations(map[string]string{ + annotationPrefix + "ipv6": "false", + annotationPrefix + "ipv4": "true", + annotationPrefix + "anycast": "2602:817:3000:ac::1", + }, BuiltinFamilyDefaults()) + if err == nil { + t.Fatal("want error for ipv6=false + anycast v6, got nil") + } +} + +func TestParseAnnotations_Addresses_NodeDefaultV4Off(t *testing.T) { + // NodeConfig default opts v4 off for the node, and the pod has no + // explicit ipv4 annotation. addresses-v4 still conflicts because the + // resolved WantV4 is false. Operator must add `ipv4: "true"` on the + // pod to override the node default. + defaults := FamilyDefaults{WantV6: true, WantV4: false} + _, err := ParseAnnotations(map[string]string{ + annotationPrefix + "addresses": "142.202.202.166", + }, defaults) + if err == nil { + t.Fatal("want error for NodeConfig v4=false + addresses v4, got nil") + } +} + +func TestParseAnnotations_Addresses_NodeDefaultV4Off_PodOptsBackIn(t *testing.T) { + // Same as above but pod explicitly sets ipv4=true to override the node + // default. Conflict resolved; parse succeeds. + defaults := FamilyDefaults{WantV6: true, WantV4: false} + a, err := ParseAnnotations(map[string]string{ + annotationPrefix + "ipv4": "true", + annotationPrefix + "addresses": "142.202.202.166", + }, defaults) + if err != nil { + t.Fatalf("expected ok, got %v", err) + } + if !a.WantV4 || len(a.Addresses) != 1 { + t.Fatalf("unexpected: %+v", a) + } +} + func TestParseCNIArgs(t *testing.T) { args := ParseCNIArgs("IgnoreUnknown=1;K8S_POD_NAMESPACE=mail;K8S_POD_NAME=stalwart-0;K8S_POD_INFRA_CONTAINER_ID=abc123") if args.PodNamespace != "mail" || args.PodName != "stalwart-0" || args.InfraID != "abc123" { diff --git a/pkg/agent/anycast_linux.go b/pkg/agent/anycast_linux.go index dc0f907..53e4fd4 100644 --- a/pkg/agent/anycast_linux.go +++ b/pkg/agent/anycast_linux.go @@ -142,22 +142,74 @@ func (r *AnycastReconciler) renderBird(desired map[string]anycastTarget) { return } var v6, v4 []string - for ipStr := range desired { - ip := net.ParseIP(ipStr) - if ip == nil { - continue + seen := map[string]struct{}{} + add := func(ip net.IP) { + key := canonical(ip) + if _, dup := seen[key]; dup { + return } + seen[key] = struct{}{} if ip.To4() != nil { v4 = append(v4, ip.To4().String()) } else { v6 = append(v6, ip.To16().String()) } } + for ipStr := range desired { + if ip := net.ParseIP(ipStr); ip != nil { + add(ip) + } + } + // A pod IP that lives outside the node's BGP aggregate (e.g. an + // addresses-annotation IP promoted to be the pod's primary v4 — Plex's + // 142.202.202.166 against host004's 172.25.214.0/24) is not naturally + // covered by the aggregate, so it must be advertised individually as a + // /32 or /128. Anycast and addresses extras are already covered by the + // `desired` loop above; this sweep is for promoted-primary IPs which do + // not flow through the AnycastReconciler. + nodeV6, nodeV4 := parseNodeCIDRs(nc) + for _, a := range r.Store.Snapshot() { + if a.State != StateCommitted { + continue + } + if ip := net.ParseIP(a.IP6); ip != nil && !ipInAny(ip, nodeV6) { + add(ip) + } + if ip := net.ParseIP(a.IP4); ip != nil && !ipInAny(ip, nodeV4) { + add(ip) + } + } if err := r.Bird.Render(nc, v6, v4, r.RouterID); err != nil { r.Logger.Warn("anycast bird render", "err", err) } } +// parseNodeCIDRs parses NodeConfig.Spec.CIDR6/4 strings into IPNets, +// silently dropping malformed entries (admission-time validation should +// have rejected them long before this point). +func parseNodeCIDRs(nc *flockv1alpha1.NodeConfig) (v6, v4 []*net.IPNet) { + for _, s := range nc.Spec.CIDR6 { + if _, n, err := net.ParseCIDR(s); err == nil { + v6 = append(v6, n) + } + } + for _, s := range nc.Spec.CIDR4 { + if _, n, err := net.ParseCIDR(s); err == nil { + v4 = append(v4, n) + } + } + return +} + +func ipInAny(ip net.IP, nets []*net.IPNet) bool { + for _, n := range nets { + if n.Contains(ip) { + return true + } + } + return false +} + // installAnycastRoute installs `/<128|32>` pointing at the // nexthop set in t. With one nexthop the route is a plain via-route; // with multiple, it's a multipath route using RTA_MULTIPATH so the diff --git a/pkg/agent/handlers.go b/pkg/agent/handlers.go index db07b53..61ad351 100644 --- a/pkg/agent/handlers.go +++ b/pkg/agent/handlers.go @@ -140,21 +140,44 @@ func (h *PodHandler) Add(ctx context.Context, req flockcni.Request) (*current.Re } ipAlgo := ResolveIPAlgo(pod.Annotations, nodeAnn, h.Logger) + // addresses-annotation IPs replace IPAM allocation for any family they + // cover. Plex needs its public IPv4 to be the pod's primary v4 (default + // route source, on-link host route, /32 in BGP) — not just an extra IP + // layered on top of a private IPAM allocation. Peel one v6 + one v4 out + // of Addresses to use as the pod's primary IPs; anything beyond that + // stays in addrExtras and gets the existing layered behavior. + addrV6, addrV4, addrExtras := splitAddressesPrimary(parsed.Addresses) + allocReq := AllocRequest{ ContainerID: req.ContainerID, Namespace: args.PodNamespace, Pod: args.PodName, App: deriveAppName(pod), - WantV6: parsed.WantV6, - WantV4: parsed.WantV4, + WantV6: parsed.WantV6 && addrV6 == nil, + WantV4: parsed.WantV4 && addrV4 == nil, AnnCIDR6: parsed.CIDR6, AnnCIDR4: parsed.CIDR4, IPAlgo: ipAlgo, Image: podImageRef(pod), } - res, err := h.IPAM.Allocate(allocReq) - if err != nil { - return nil, fmt.Errorf("ipam: %w", err) + var res AllocResult + if allocReq.WantV6 || allocReq.WantV4 { + var err error + res, err = h.IPAM.Allocate(allocReq) + if err != nil { + return nil, fmt.Errorf("ipam: %w", err) + } + } + // Promote the peeled addresses IPs into the primary slots. They get the + // IPAM-style routing path: bound to eth0 in configurePodSide, default + // route via fe80::1 / v4ProxyGW, on-link host route via setHostRoute. + // BGP advertisement of the /32/128 is handled by the AnycastReconciler + // via renderBird's outside-aggregate detection. + if addrV6 != nil { + res.IP6 = addrV6 + } + if addrV4 != nil { + res.IP4 = addrV4 } // Persist pending entry before any netlink work so a crash mid-ADD @@ -167,7 +190,7 @@ func (h *PodHandler) Add(ctx context.Context, req flockcni.Request) (*current.Re IP6: ipString(res.IP6), IP4: ipString(res.IP4), Anycast: anycastStrings(parsed.Anycast), - Addresses: anycastStrings(parsed.Addresses), + Addresses: anycastStrings(addrExtras), State: StatePending, AllocatedAt: time.Now().UTC(), } @@ -184,7 +207,7 @@ func (h *PodHandler) Add(ctx context.Context, req flockcni.Request) (*current.Re IP6: res.IP6, IP4: res.IP4, Anycast: parsed.Anycast, - Addresses: parsed.Addresses, + Addresses: addrExtras, } if err := h.SetupFunc(setup); err != nil { // Roll forward: leave pending entry in place so startup GC can clean @@ -270,6 +293,33 @@ func ipString(ip net.IP) string { return canonical(ip) } +// splitAddressesPrimary peels off the first IPv6 and first IPv4 from the +// addresses list to use as the pod's primary IPs in place of an IPAM +// allocation. The remaining entries (anything beyond the first of each +// family) stay in extras for the existing layered eth0 binding via the +// AnycastReconciler's via-route path. +// +// Order of the input is preserved in extras. Either of v6/v4 may be nil +// when the addresses list contains no IP of that family — the caller falls +// back to IPAM allocation in that case. +func splitAddressesPrimary(ips []net.IP) (v6, v4 net.IP, extras []net.IP) { + for _, ip := range ips { + if ip.To4() != nil { + if v4 == nil { + v4 = ip.To4() + continue + } + } else { + if v6 == nil { + v6 = ip.To16() + continue + } + } + extras = append(extras, ip) + } + return +} + func anycastStrings(ips []net.IP) []string { if len(ips) == 0 { return nil diff --git a/pkg/agent/handlers_test.go b/pkg/agent/handlers_test.go index 9629fb5..1334ee3 100644 --- a/pkg/agent/handlers_test.go +++ b/pkg/agent/handlers_test.go @@ -1,6 +1,7 @@ package agent import ( + "net" "testing" corev1 "k8s.io/api/core/v1" @@ -106,3 +107,80 @@ func TestPodImageRef(t *testing.T) { t.Fatalf("got %q, want \"\"", got) } } + +func TestSplitAddressesPrimary_BothFamilies(t *testing.T) { + // Plex pattern: one v6 + one v4 → both peel out, no extras. + ips := []net.IP{ + net.ParseIP("2602:817:3000:c606::166"), + net.ParseIP("142.202.202.166"), + } + v6, v4, extras := splitAddressesPrimary(ips) + if v6 == nil || v6.String() != "2602:817:3000:c606::166" { + t.Fatalf("v6 = %v", v6) + } + if v4 == nil || v4.String() != "142.202.202.166" { + t.Fatalf("v4 = %v", v4) + } + if len(extras) != 0 { + t.Fatalf("extras = %v, want empty", extras) + } +} + +func TestSplitAddressesPrimary_OnlyV4(t *testing.T) { + v6, v4, extras := splitAddressesPrimary([]net.IP{net.ParseIP("142.202.202.166")}) + if v6 != nil { + t.Fatalf("v6 should be nil, got %v", v6) + } + if v4 == nil || v4.String() != "142.202.202.166" { + t.Fatalf("v4 = %v", v4) + } + if len(extras) != 0 { + t.Fatalf("extras = %v", extras) + } +} + +func TestSplitAddressesPrimary_OnlyV6(t *testing.T) { + v6, v4, extras := splitAddressesPrimary([]net.IP{net.ParseIP("2602:817:3000:c606::166")}) + if v4 != nil { + t.Fatalf("v4 should be nil, got %v", v4) + } + if v6 == nil || v6.String() != "2602:817:3000:c606::166" { + t.Fatalf("v6 = %v", v6) + } + if len(extras) != 0 { + t.Fatalf("extras = %v", extras) + } +} + +func TestSplitAddressesPrimary_Empty(t *testing.T) { + v6, v4, extras := splitAddressesPrimary(nil) + if v6 != nil || v4 != nil || extras != nil { + t.Fatalf("nil input should yield nil outputs, got v6=%v v4=%v extras=%v", v6, v4, extras) + } +} + +func TestSplitAddressesPrimary_Extras(t *testing.T) { + // Multiple v4s — only the first peels into the primary slot; the rest + // stay in extras for layered-eth0 binding via the AnycastReconciler. + // (Not a current production use case, but the code should handle it + // without dropping IPs.) + ips := []net.IP{ + net.ParseIP("142.202.202.166"), + net.ParseIP("2602:817:3000:c606::166"), + net.ParseIP("142.202.202.167"), + net.ParseIP("2602:817:3000:c606::167"), + } + v6, v4, extras := splitAddressesPrimary(ips) + if v4.String() != "142.202.202.166" { + t.Fatalf("v4 primary = %v, want 142.202.202.166", v4) + } + if v6.String() != "2602:817:3000:c606::166" { + t.Fatalf("v6 primary = %v, want 2602:817:3000:c606::166", v6) + } + if len(extras) != 2 { + t.Fatalf("extras len = %d, want 2", len(extras)) + } + if extras[0].String() != "142.202.202.167" || extras[1].String() != "2602:817:3000:c606::167" { + t.Fatalf("extras order/content wrong: %v", extras) + } +}