pkg/agent/ipam: IPAM allocator with dual-stack + IID embedding
Build flock Image / build (push) Has been cancelled

Core building block for M2 CNI ADD. Pure logic (no netlink), mutex-
serialized, seedable from committed state via MarkInUse. Hooks into
pkg/embed for ip-algo IID derivation.

- resolveEffective() implements the design-doc cidr6/cidr4 annotation
  rules: equal→node, supernet→node, subnet→ann, disjoint→error.
  First-match-wins across multiple annotation CIDRs.
- allocV6() random IID within the effective CIDR; on ip-algo, defers
  to embed.Embed. 16-retry on collision (regenerates IID or N nibble).
- allocV4() linear scan skipping .0 (network), .1 (gateway), .<last>
  (broadcast). Smallest supported block: /30 with 1 usable address.
- Deterministic fakeRand in tests covers: intersection matrix, random
  IID, embed path, collision→retry, v4 skip-gateway, v4 exhaustion,
  dual-stack, release-then-reallocate, family mismatch rejection.

No agent Run-loop integration yet — NewIPAM(nc.Spec.CIDR6, nc.Spec.CIDR4)
will be called from Server.Run once netlink + RPC are in place.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Donavan Fritz
2026-04-24 22:14:11 -05:00
parent 759ed21b37
commit c09c62fbaa
2 changed files with 671 additions and 0 deletions
+377
View File
@@ -0,0 +1,377 @@
package agent
import (
"crypto/rand"
"fmt"
"math/big"
"net"
"sync"
"code.fritzlab.net/fritzlab/flock/pkg/embed"
)
// IPAM allocates per-pod IPv6 and IPv4 addresses from the NodeConfig CIDRs.
//
// Invariants:
// - Concurrent callers serialize through mu.
// - Allocate() never returns an address already marked in-use. Collisions
// trigger retry (IPv6: up to 16 regenerations of the random N nibble or
// random IID; IPv4: linear scan of the effective /24 or smaller).
// - Release is idempotent.
//
// IPAM is constructed per-agent; it holds no durable state. The caller is
// responsible for marking pre-existing committed allocations in-use via
// MarkInUse during startup reconciliation.
type IPAM struct {
mu sync.Mutex
cidr6 []*net.IPNet
cidr4 []*net.IPNet
used map[string]struct{} // canonical IP strings
randSrc randSource
}
// randSource is injectable so tests can use a deterministic source.
type randSource interface {
// NibbleN returns a random 4-bit value.
NibbleN() byte
// FillIID fills dst with random bytes.
FillIID(dst []byte)
// PickIndex returns a pseudo-random int in [0, n). Tests may pin this.
PickIndex(n int) int
}
// cryptoRand uses crypto/rand for real allocations.
type cryptoRand struct{}
func (cryptoRand) NibbleN() byte {
var b [1]byte
_, _ = rand.Read(b[:])
return b[0] & 0x0F
}
func (cryptoRand) FillIID(dst []byte) {
_, _ = rand.Read(dst)
}
func (cryptoRand) PickIndex(n int) int {
if n <= 1 {
return 0
}
big_, _ := rand.Int(rand.Reader, big.NewInt(int64(n)))
return int(big_.Int64())
}
// AllocRequest describes a pending allocation. Values come from Pod metadata
// + annotations at CNI ADD time.
type AllocRequest struct {
ContainerID string
Namespace string
Pod string
// WantV6 / WantV4 come from the ipv6 / ipv4 annotations (defaults in
// design doc: ipv6=true, ipv4=false).
WantV6 bool
WantV4 bool
// AnnCIDR6 / AnnCIDR4 come from the cidr6 / cidr4 annotations. Empty
// means "use any of the node's CIDRs".
AnnCIDR6 []*net.IPNet
AnnCIDR4 []*net.IPNet
// IPAlgo comes from the ip-algo annotation. Empty means random IID.
IPAlgo []embed.Field
// ImageDigest is the sha256 manifest digest (with or without "sha256:"
// prefix). If empty, embed.Values.ImageFallback = ContainerID is used
// for ip-algo fields that reference image.
ImageDigest string
}
// AllocResult is what the IPAM hands back to the CNI ADD.
type AllocResult struct {
IP6 net.IP // nil if WantV6 was false
IP4 net.IP
}
// NewIPAM constructs an IPAM seeded from NodeConfig CIDRs.
func NewIPAM(cidr6, cidr4 []string) (*IPAM, error) {
i := &IPAM{
used: map[string]struct{}{},
randSrc: cryptoRand{},
}
for _, s := range cidr6 {
_, n, err := net.ParseCIDR(s)
if err != nil {
return nil, fmt.Errorf("cidr6 %q: %w", s, err)
}
if n.IP.To4() != nil {
return nil, fmt.Errorf("cidr6 %q is IPv4", s)
}
i.cidr6 = append(i.cidr6, n)
}
for _, s := range cidr4 {
_, n, err := net.ParseCIDR(s)
if err != nil {
return nil, fmt.Errorf("cidr4 %q: %w", s, err)
}
if n.IP.To4() == nil {
return nil, fmt.Errorf("cidr4 %q is not IPv4", s)
}
i.cidr4 = append(i.cidr4, n)
}
return i, nil
}
// MarkInUse records that `ip` is already taken. Used during startup
// reconciliation to seed the allocator from committed state.
func (i *IPAM) MarkInUse(ip net.IP) {
if ip == nil {
return
}
i.mu.Lock()
i.used[canonical(ip)] = struct{}{}
i.mu.Unlock()
}
// Release marks one or more IPs as free. Safe to call for IPs not in use.
func (i *IPAM) Release(ips ...net.IP) {
i.mu.Lock()
defer i.mu.Unlock()
for _, ip := range ips {
if ip != nil {
delete(i.used, canonical(ip))
}
}
}
// Allocate resolves effective CIDRs from the request + node CIDRs, picks
// addresses, and records them as in-use. It is atomic with respect to other
// Allocate / Release / MarkInUse calls.
func (i *IPAM) Allocate(req AllocRequest) (AllocResult, error) {
if !req.WantV6 && !req.WantV4 {
return AllocResult{}, fmt.Errorf("pod %s/%s: at least one of ipv6/ipv4 must be true", req.Namespace, req.Pod)
}
i.mu.Lock()
defer i.mu.Unlock()
var out AllocResult
var claimed []net.IP
rollback := func() {
for _, ip := range claimed {
delete(i.used, canonical(ip))
}
}
if req.WantV6 {
eff, err := resolveEffective(req.AnnCIDR6, i.cidr6)
if err != nil {
return AllocResult{}, fmt.Errorf("pod %s/%s cidr6: %w", req.Namespace, req.Pod, err)
}
cidr := eff[i.randSrc.PickIndex(len(eff))]
ip, err := i.allocV6(cidr, req)
if err != nil {
rollback()
return AllocResult{}, err
}
i.used[canonical(ip)] = struct{}{}
claimed = append(claimed, ip)
out.IP6 = ip
}
if req.WantV4 {
eff, err := resolveEffective(req.AnnCIDR4, i.cidr4)
if err != nil {
rollback()
return AllocResult{}, fmt.Errorf("pod %s/%s cidr4: %w", req.Namespace, req.Pod, err)
}
cidr := eff[i.randSrc.PickIndex(len(eff))]
ip, err := i.allocV4(cidr)
if err != nil {
rollback()
return AllocResult{}, err
}
i.used[canonical(ip)] = struct{}{}
out.IP4 = ip
}
return out, nil
}
// allocV6 picks an IPv6 /128 from the given CIDR, retrying on collision.
// Caller holds i.mu.
func (i *IPAM) allocV6(cidr *net.IPNet, req AllocRequest) (net.IP, error) {
const maxAttempts = 16
for attempt := 0; attempt < maxAttempts; attempt++ {
var ip net.IP
var err error
if len(req.IPAlgo) == 0 {
ip, err = i.randomV6(cidr)
} else {
ip, err = embed.Embed(cidr, req.IPAlgo, embed.Values{
Namespace: req.Namespace,
Pod: req.Pod,
Image: req.ImageDigest,
ImageFallback: req.ContainerID,
}, i.randSrc.NibbleN())
}
if err != nil {
return nil, err
}
if _, clash := i.used[canonical(ip)]; !clash {
return ip, nil
}
}
return nil, fmt.Errorf("IPv6 allocation: %d collisions in %s — giving up", maxAttempts, cidr)
}
// randomV6 picks a random /128 inside cidr. The network prefix bits are
// preserved from cidr.IP; the host bits are filled from the random source.
func (i *IPAM) randomV6(cidr *net.IPNet) (net.IP, error) {
ones, bits := cidr.Mask.Size()
if bits != 128 {
return nil, fmt.Errorf("cidr %s is not IPv6", cidr)
}
out := make(net.IP, 16)
copy(out, cidr.IP.To16())
hostBits := 128 - ones
rnd := make([]byte, 16)
i.randSrc.FillIID(rnd)
// Merge rnd into out where mask bit is 0.
for b := 0; b < 16; b++ {
// Host bits start at bit index `ones`, byte `b`.
byteStart := b * 8
byteEnd := byteStart + 8
if byteEnd <= ones {
continue // entirely network
}
if byteStart >= ones {
out[b] = rnd[b] // entirely host
continue
}
// Split byte: top (ones-byteStart) bits are network, rest is host.
networkBits := ones - byteStart
hostMask := byte(0xFF) >> uint(networkBits)
out[b] = (out[b] & ^hostMask) | (rnd[b] & hostMask)
}
_ = hostBits
return out, nil
}
// allocV4 walks the CIDR linearly skipping network + broadcast addresses.
// Caller holds i.mu.
func (i *IPAM) allocV4(cidr *net.IPNet) (net.IP, error) {
ones, _ := cidr.Mask.Size()
total := uint64(1) << uint(32-ones)
if total < 4 {
return nil, fmt.Errorf("cidr %s has no usable host space", cidr)
}
base := ipToU32(cidr.IP.To4())
// Skip .0 (network) and .1 (reserved for gateway / routing convention) up to .<broadcast-1>.
for off := uint64(2); off < total-1; off++ {
ip := u32ToIP(base + uint32(off))
if _, clash := i.used[canonical(ip)]; !clash {
return ip, nil
}
}
return nil, fmt.Errorf("IPv4 allocation: %s exhausted", cidr)
}
// resolveEffective applies the cidr6/cidr4 annotation → node CIDR intersection
// rules from the design doc. Returns the list of CIDRs the allocator may
// actually allocate from. No intersection → error.
//
// Rules (from dfritz-cni.md):
//
// annCIDR == nodeCIDR → allocate from nodeCIDR
// annCIDR supernet of → allocate from the more specific nodeCIDR
// annCIDR subnet of → allocate from annCIDR (more restrictive)
// no overlap → caller error
//
// If annCIDRs is empty, all nodeCIDRs are eligible.
// If annCIDRs has multiple entries, the *first* that intersects any
// nodeCIDR wins, matching the design-doc phrasing "the agent uses the first
// one that intersects a node CIDR".
func resolveEffective(annCIDRs []*net.IPNet, nodeCIDRs []*net.IPNet) ([]*net.IPNet, error) {
if len(nodeCIDRs) == 0 {
return nil, fmt.Errorf("node has no CIDRs configured for this family")
}
if len(annCIDRs) == 0 {
return nodeCIDRs, nil
}
for _, ann := range annCIDRs {
var matches []*net.IPNet
for _, node := range nodeCIDRs {
m := intersectCIDR(ann, node)
if m != nil {
matches = append(matches, m)
}
}
if len(matches) > 0 {
return matches, nil
}
}
return nil, fmt.Errorf("annotation CIDRs %v do not intersect any node CIDR %v",
toStringSlice(annCIDRs), toStringSlice(nodeCIDRs))
}
// intersectCIDR returns the effective allocation range between an annotation
// CIDR and a node CIDR, or nil if disjoint.
func intersectCIDR(ann, node *net.IPNet) *net.IPNet {
// Same address family only.
if (ann.IP.To4() == nil) != (node.IP.To4() == nil) {
return nil
}
switch {
case cidrEqual(ann, node):
return node
case cidrContains(ann, node):
// ann is supernet of node → allocate from node.
return node
case cidrContains(node, ann):
// ann is subnet of node → allocate from ann.
return ann
default:
return nil
}
}
func cidrEqual(a, b *net.IPNet) bool {
if !a.IP.Equal(b.IP) {
return false
}
ao, _ := a.Mask.Size()
bo, _ := b.Mask.Size()
return ao == bo
}
// cidrContains returns true if `a` is a strict supernet of `b` (a ⊋ b).
func cidrContains(a, b *net.IPNet) bool {
ao, _ := a.Mask.Size()
bo, _ := b.Mask.Size()
if ao >= bo {
return false
}
return a.Contains(b.IP)
}
func toStringSlice(ns []*net.IPNet) []string {
out := make([]string, len(ns))
for i, n := range ns {
out[i] = n.String()
}
return out
}
func canonical(ip net.IP) string {
if v4 := ip.To4(); v4 != nil {
return v4.String()
}
return ip.To16().String()
}
func ipToU32(ip net.IP) uint32 {
v4 := ip.To4()
return uint32(v4[0])<<24 | uint32(v4[1])<<16 | uint32(v4[2])<<8 | uint32(v4[3])
}
func u32ToIP(u uint32) net.IP {
return net.IPv4(byte(u>>24), byte(u>>16), byte(u>>8), byte(u)).To4()
}
+294
View File
@@ -0,0 +1,294 @@
package agent
import (
"net"
"testing"
"code.fritzlab.net/fritzlab/flock/pkg/embed"
)
// fakeRand is a deterministic randSource for tests.
type fakeRand struct {
nibbles []byte // queue of NibbleN results
iids [][]byte
picks []int
n, i, p int
}
func (f *fakeRand) NibbleN() byte {
v := f.nibbles[f.n%len(f.nibbles)]
f.n++
return v & 0x0F
}
func (f *fakeRand) FillIID(dst []byte) {
src := f.iids[f.i%len(f.iids)]
f.i++
copy(dst, src)
}
func (f *fakeRand) PickIndex(n int) int {
if len(f.picks) == 0 {
return 0
}
v := f.picks[f.p%len(f.picks)]
f.p++
if v >= n {
return 0
}
return v
}
func mustNet(t *testing.T, s string) *net.IPNet {
t.Helper()
_, n, err := net.ParseCIDR(s)
if err != nil {
t.Fatal(err)
}
return n
}
func TestIntersectCIDR(t *testing.T) {
v6a := mustNet(t, "2602:817:3000:f001::/64")
v6super := mustNet(t, "2602:817:3000::/48")
v6sub := mustNet(t, "2602:817:3000:f001::/96")
v6other := mustNet(t, "2602:817:3000:f002::/64")
v4 := mustNet(t, "10.0.0.0/24")
cases := []struct {
name string
ann *net.IPNet
node *net.IPNet
want *net.IPNet
}{
{"equal → node", v6a, v6a, v6a},
{"supernet of node → node", v6super, v6a, v6a},
{"subnet of node → ann", v6sub, v6a, v6sub},
{"disjoint same family → nil", v6other, v6a, nil},
{"different family → nil", v4, v6a, nil},
}
for _, c := range cases {
got := intersectCIDR(c.ann, c.node)
if (got == nil) != (c.want == nil) {
t.Errorf("%s: got=%v want=%v", c.name, got, c.want)
continue
}
if got != nil && !cidrEqual(got, c.want) {
t.Errorf("%s: got=%s want=%s", c.name, got, c.want)
}
}
}
func TestResolveEffective(t *testing.T) {
nodes := []*net.IPNet{
mustNet(t, "2602:817:3000:f001::/64"),
mustNet(t, "2602:817:3000:f002::/64"),
}
// Empty annotation → all node CIDRs.
if got, err := resolveEffective(nil, nodes); err != nil || len(got) != 2 {
t.Fatalf("empty ann: got=%v err=%v", got, err)
}
// Subnet of one node CIDR → that subnet only.
ann := []*net.IPNet{mustNet(t, "2602:817:3000:f001::/96")}
got, err := resolveEffective(ann, nodes)
if err != nil || len(got) != 1 || got[0].String() != "2602:817:3000:f001::/96" {
t.Fatalf("subnet ann: got=%v err=%v", got, err)
}
// Supernet matches both node CIDRs.
ann = []*net.IPNet{mustNet(t, "2602:817:3000::/48")}
got, err = resolveEffective(ann, nodes)
if err != nil || len(got) != 2 {
t.Fatalf("supernet ann: got=%v err=%v", got, err)
}
// First-match-wins: the first ann CIDR that intersects anything wins.
ann = []*net.IPNet{
mustNet(t, "2602:817:3000:ff::/64"), // no intersection
mustNet(t, "2602:817:3000:f001::/64"),
}
got, err = resolveEffective(ann, nodes)
if err != nil || len(got) != 1 || got[0].String() != "2602:817:3000:f001::/64" {
t.Fatalf("first-match: got=%v err=%v", got, err)
}
// No overlap → error.
ann = []*net.IPNet{mustNet(t, "2602:817:3000:ff::/64")}
if _, err := resolveEffective(ann, nodes); err == nil {
t.Fatalf("expected error for disjoint ann")
}
}
func TestIPAM_AllocV6_Random(t *testing.T) {
i, err := NewIPAM([]string{"2602:817:3000:f001::/64"}, nil)
if err != nil {
t.Fatal(err)
}
i.randSrc = &fakeRand{
iids: [][]byte{
// 16 bytes; only the low 8 bytes are used for a /64 host portion.
{0, 0, 0, 0, 0, 0, 0, 0, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x11, 0x22},
},
}
res, err := i.Allocate(AllocRequest{
ContainerID: "c1", Namespace: "ns", Pod: "p", WantV6: true,
})
if err != nil {
t.Fatalf("Allocate: %v", err)
}
if res.IP6.String() != "2602:817:3000:f001:aabb:ccdd:eeff:1122" {
t.Fatalf("IP6 = %s, want 2602:817:3000:f001:aabb:ccdd:eeff:1122", res.IP6)
}
}
func TestIPAM_AllocV6_WithEmbed(t *testing.T) {
i, err := NewIPAM([]string{"2602:817:3000:f001::/64"}, nil)
if err != nil {
t.Fatal(err)
}
i.randSrc = &fakeRand{nibbles: []byte{0xe}}
res, err := i.Allocate(AllocRequest{
ContainerID: "c1", Namespace: "mail", Pod: "stalwart-0", WantV6: true,
IPAlgo: []embed.Field{embed.FieldNamespace, embed.FieldPod, embed.FieldImage},
})
if err != nil {
t.Fatalf("Allocate: %v", err)
}
// Last nibble from fakeRand NibbleN → 0xe.
if got := res.IP6[15] & 0x0F; got != 0xe {
t.Fatalf("last nibble = %x, want e", got)
}
mustNet(t, "2602:817:3000:f001::/64").Contains(res.IP6)
}
func TestIPAM_AllocV6_CollisionRetry(t *testing.T) {
i, err := NewIPAM([]string{"2602:817:3000:f001::/64"}, nil)
if err != nil {
t.Fatal(err)
}
// Mark one specific address in-use, then feed the allocator that same
// address first and a new one on retry.
first := net.ParseIP("2602:817:3000:f001::1")
second := net.ParseIP("2602:817:3000:f001::2")
i.MarkInUse(first)
i.randSrc = &fakeRand{
iids: [][]byte{
append(make([]byte, 8), 0, 0, 0, 0, 0, 0, 0, 0x01), // -> ...:1 (collides)
append(make([]byte, 8), 0, 0, 0, 0, 0, 0, 0, 0x02), // -> ...:2 (ok)
},
}
res, err := i.Allocate(AllocRequest{
ContainerID: "c1", Namespace: "ns", Pod: "p", WantV6: true,
})
if err != nil {
t.Fatalf("Allocate: %v", err)
}
if !res.IP6.Equal(second) {
t.Fatalf("IP6 = %s, want %s (collision retry)", res.IP6, second)
}
}
func TestIPAM_AllocV4_SkipsNetworkAndGateway(t *testing.T) {
i, err := NewIPAM(nil, []string{"172.25.210.0/24"})
if err != nil {
t.Fatal(err)
}
i.randSrc = &fakeRand{}
res, err := i.Allocate(AllocRequest{
ContainerID: "c1", Namespace: "ns", Pod: "p", WantV4: true,
})
if err != nil {
t.Fatalf("Allocate: %v", err)
}
if res.IP4.String() != "172.25.210.2" {
t.Fatalf("IP4 = %s, want 172.25.210.2 (skip .0 network + .1 gateway)", res.IP4)
}
}
func TestIPAM_AllocV4_Sequential(t *testing.T) {
i, err := NewIPAM(nil, []string{"172.25.210.0/29"})
if err != nil {
t.Fatal(err)
}
i.randSrc = &fakeRand{}
want := []string{"172.25.210.2", "172.25.210.3", "172.25.210.4", "172.25.210.5", "172.25.210.6"}
for _, w := range want {
res, err := i.Allocate(AllocRequest{ContainerID: w, WantV4: true})
if err != nil {
t.Fatalf("Allocate: %v", err)
}
if res.IP4.String() != w {
t.Fatalf("got %s, want %s", res.IP4, w)
}
}
// Next allocation should fail — /29 exhausted (.0 network, .1 gateway, .7 broadcast).
if _, err := i.Allocate(AllocRequest{ContainerID: "extra", WantV4: true}); err == nil {
t.Fatalf("expected exhaustion error")
}
}
func TestIPAM_DualStack(t *testing.T) {
i, err := NewIPAM(
[]string{"2602:817:3000:f001::/64"},
[]string{"172.25.210.0/24"},
)
if err != nil {
t.Fatal(err)
}
i.randSrc = &fakeRand{
iids: [][]byte{
{0, 0, 0, 0, 0, 0, 0, 0, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x11, 0x22},
},
}
res, err := i.Allocate(AllocRequest{
ContainerID: "c1", WantV6: true, WantV4: true,
})
if err != nil {
t.Fatalf("Allocate: %v", err)
}
if res.IP6 == nil || res.IP4 == nil {
t.Fatalf("dual-stack result missing IPs: %+v", res)
}
if res.IP4.String() != "172.25.210.2" {
t.Fatalf("IP4 = %s", res.IP4)
}
}
func TestIPAM_RejectsNoFamily(t *testing.T) {
i, _ := NewIPAM([]string{"2602:817:3000:f001::/64"}, nil)
if _, err := i.Allocate(AllocRequest{ContainerID: "c"}); err == nil {
t.Fatalf("expected error for no v6/v4")
}
}
func TestIPAM_Release(t *testing.T) {
i, _ := NewIPAM([]string{"2602:817:3000:f001::/64"}, nil)
i.randSrc = &fakeRand{
iids: [][]byte{
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1},
},
}
res1, err := i.Allocate(AllocRequest{ContainerID: "c1", WantV6: true})
if err != nil {
t.Fatal(err)
}
i.Release(res1.IP6)
// Next Allocate should be free to pick the same address again.
res2, err := i.Allocate(AllocRequest{ContainerID: "c2", WantV6: true})
if err != nil {
t.Fatal(err)
}
if !res2.IP6.Equal(res1.IP6) {
t.Fatalf("expected release to free %s; got %s", res1.IP6, res2.IP6)
}
}
func TestIPAM_RejectsMismatchedFamily(t *testing.T) {
if _, err := NewIPAM([]string{"10.0.0.0/24"}, nil); err == nil {
t.Fatalf("expected v4 in cidr6 to fail")
}
if _, err := NewIPAM(nil, []string{"2602:817:3000::/64"}); err == nil {
t.Fatalf("expected v6 in cidr4 to fail")
}
}