diff --git a/pkg/agent/ipam.go b/pkg/agent/ipam.go new file mode 100644 index 0000000..2ade078 --- /dev/null +++ b/pkg/agent/ipam.go @@ -0,0 +1,377 @@ +package agent + +import ( + "crypto/rand" + "fmt" + "math/big" + "net" + "sync" + + "code.fritzlab.net/fritzlab/flock/pkg/embed" +) + +// IPAM allocates per-pod IPv6 and IPv4 addresses from the NodeConfig CIDRs. +// +// Invariants: +// - Concurrent callers serialize through mu. +// - Allocate() never returns an address already marked in-use. Collisions +// trigger retry (IPv6: up to 16 regenerations of the random N nibble or +// random IID; IPv4: linear scan of the effective /24 or smaller). +// - Release is idempotent. +// +// IPAM is constructed per-agent; it holds no durable state. The caller is +// responsible for marking pre-existing committed allocations in-use via +// MarkInUse during startup reconciliation. +type IPAM struct { + mu sync.Mutex + cidr6 []*net.IPNet + cidr4 []*net.IPNet + used map[string]struct{} // canonical IP strings + randSrc randSource +} + +// randSource is injectable so tests can use a deterministic source. +type randSource interface { + // NibbleN returns a random 4-bit value. + NibbleN() byte + // FillIID fills dst with random bytes. + FillIID(dst []byte) + // PickIndex returns a pseudo-random int in [0, n). Tests may pin this. + PickIndex(n int) int +} + +// cryptoRand uses crypto/rand for real allocations. +type cryptoRand struct{} + +func (cryptoRand) NibbleN() byte { + var b [1]byte + _, _ = rand.Read(b[:]) + return b[0] & 0x0F +} + +func (cryptoRand) FillIID(dst []byte) { + _, _ = rand.Read(dst) +} + +func (cryptoRand) PickIndex(n int) int { + if n <= 1 { + return 0 + } + big_, _ := rand.Int(rand.Reader, big.NewInt(int64(n))) + return int(big_.Int64()) +} + +// AllocRequest describes a pending allocation. Values come from Pod metadata +// + annotations at CNI ADD time. +type AllocRequest struct { + ContainerID string + Namespace string + Pod string + // WantV6 / WantV4 come from the ipv6 / ipv4 annotations (defaults in + // design doc: ipv6=true, ipv4=false). + WantV6 bool + WantV4 bool + // AnnCIDR6 / AnnCIDR4 come from the cidr6 / cidr4 annotations. Empty + // means "use any of the node's CIDRs". + AnnCIDR6 []*net.IPNet + AnnCIDR4 []*net.IPNet + // IPAlgo comes from the ip-algo annotation. Empty means random IID. + IPAlgo []embed.Field + // ImageDigest is the sha256 manifest digest (with or without "sha256:" + // prefix). If empty, embed.Values.ImageFallback = ContainerID is used + // for ip-algo fields that reference image. + ImageDigest string +} + +// AllocResult is what the IPAM hands back to the CNI ADD. +type AllocResult struct { + IP6 net.IP // nil if WantV6 was false + IP4 net.IP +} + +// NewIPAM constructs an IPAM seeded from NodeConfig CIDRs. +func NewIPAM(cidr6, cidr4 []string) (*IPAM, error) { + i := &IPAM{ + used: map[string]struct{}{}, + randSrc: cryptoRand{}, + } + for _, s := range cidr6 { + _, n, err := net.ParseCIDR(s) + if err != nil { + return nil, fmt.Errorf("cidr6 %q: %w", s, err) + } + if n.IP.To4() != nil { + return nil, fmt.Errorf("cidr6 %q is IPv4", s) + } + i.cidr6 = append(i.cidr6, n) + } + for _, s := range cidr4 { + _, n, err := net.ParseCIDR(s) + if err != nil { + return nil, fmt.Errorf("cidr4 %q: %w", s, err) + } + if n.IP.To4() == nil { + return nil, fmt.Errorf("cidr4 %q is not IPv4", s) + } + i.cidr4 = append(i.cidr4, n) + } + return i, nil +} + +// MarkInUse records that `ip` is already taken. Used during startup +// reconciliation to seed the allocator from committed state. +func (i *IPAM) MarkInUse(ip net.IP) { + if ip == nil { + return + } + i.mu.Lock() + i.used[canonical(ip)] = struct{}{} + i.mu.Unlock() +} + +// Release marks one or more IPs as free. Safe to call for IPs not in use. +func (i *IPAM) Release(ips ...net.IP) { + i.mu.Lock() + defer i.mu.Unlock() + for _, ip := range ips { + if ip != nil { + delete(i.used, canonical(ip)) + } + } +} + +// Allocate resolves effective CIDRs from the request + node CIDRs, picks +// addresses, and records them as in-use. It is atomic with respect to other +// Allocate / Release / MarkInUse calls. +func (i *IPAM) Allocate(req AllocRequest) (AllocResult, error) { + if !req.WantV6 && !req.WantV4 { + return AllocResult{}, fmt.Errorf("pod %s/%s: at least one of ipv6/ipv4 must be true", req.Namespace, req.Pod) + } + + i.mu.Lock() + defer i.mu.Unlock() + + var out AllocResult + var claimed []net.IP + + rollback := func() { + for _, ip := range claimed { + delete(i.used, canonical(ip)) + } + } + + if req.WantV6 { + eff, err := resolveEffective(req.AnnCIDR6, i.cidr6) + if err != nil { + return AllocResult{}, fmt.Errorf("pod %s/%s cidr6: %w", req.Namespace, req.Pod, err) + } + cidr := eff[i.randSrc.PickIndex(len(eff))] + ip, err := i.allocV6(cidr, req) + if err != nil { + rollback() + return AllocResult{}, err + } + i.used[canonical(ip)] = struct{}{} + claimed = append(claimed, ip) + out.IP6 = ip + } + + if req.WantV4 { + eff, err := resolveEffective(req.AnnCIDR4, i.cidr4) + if err != nil { + rollback() + return AllocResult{}, fmt.Errorf("pod %s/%s cidr4: %w", req.Namespace, req.Pod, err) + } + cidr := eff[i.randSrc.PickIndex(len(eff))] + ip, err := i.allocV4(cidr) + if err != nil { + rollback() + return AllocResult{}, err + } + i.used[canonical(ip)] = struct{}{} + out.IP4 = ip + } + + return out, nil +} + +// allocV6 picks an IPv6 /128 from the given CIDR, retrying on collision. +// Caller holds i.mu. +func (i *IPAM) allocV6(cidr *net.IPNet, req AllocRequest) (net.IP, error) { + const maxAttempts = 16 + for attempt := 0; attempt < maxAttempts; attempt++ { + var ip net.IP + var err error + if len(req.IPAlgo) == 0 { + ip, err = i.randomV6(cidr) + } else { + ip, err = embed.Embed(cidr, req.IPAlgo, embed.Values{ + Namespace: req.Namespace, + Pod: req.Pod, + Image: req.ImageDigest, + ImageFallback: req.ContainerID, + }, i.randSrc.NibbleN()) + } + if err != nil { + return nil, err + } + if _, clash := i.used[canonical(ip)]; !clash { + return ip, nil + } + } + return nil, fmt.Errorf("IPv6 allocation: %d collisions in %s — giving up", maxAttempts, cidr) +} + +// randomV6 picks a random /128 inside cidr. The network prefix bits are +// preserved from cidr.IP; the host bits are filled from the random source. +func (i *IPAM) randomV6(cidr *net.IPNet) (net.IP, error) { + ones, bits := cidr.Mask.Size() + if bits != 128 { + return nil, fmt.Errorf("cidr %s is not IPv6", cidr) + } + out := make(net.IP, 16) + copy(out, cidr.IP.To16()) + hostBits := 128 - ones + rnd := make([]byte, 16) + i.randSrc.FillIID(rnd) + // Merge rnd into out where mask bit is 0. + for b := 0; b < 16; b++ { + // Host bits start at bit index `ones`, byte `b`. + byteStart := b * 8 + byteEnd := byteStart + 8 + if byteEnd <= ones { + continue // entirely network + } + if byteStart >= ones { + out[b] = rnd[b] // entirely host + continue + } + // Split byte: top (ones-byteStart) bits are network, rest is host. + networkBits := ones - byteStart + hostMask := byte(0xFF) >> uint(networkBits) + out[b] = (out[b] & ^hostMask) | (rnd[b] & hostMask) + } + _ = hostBits + return out, nil +} + +// allocV4 walks the CIDR linearly skipping network + broadcast addresses. +// Caller holds i.mu. +func (i *IPAM) allocV4(cidr *net.IPNet) (net.IP, error) { + ones, _ := cidr.Mask.Size() + total := uint64(1) << uint(32-ones) + if total < 4 { + return nil, fmt.Errorf("cidr %s has no usable host space", cidr) + } + base := ipToU32(cidr.IP.To4()) + // Skip .0 (network) and .1 (reserved for gateway / routing convention) up to .. + for off := uint64(2); off < total-1; off++ { + ip := u32ToIP(base + uint32(off)) + if _, clash := i.used[canonical(ip)]; !clash { + return ip, nil + } + } + return nil, fmt.Errorf("IPv4 allocation: %s exhausted", cidr) +} + +// resolveEffective applies the cidr6/cidr4 annotation → node CIDR intersection +// rules from the design doc. Returns the list of CIDRs the allocator may +// actually allocate from. No intersection → error. +// +// Rules (from dfritz-cni.md): +// +// annCIDR == nodeCIDR → allocate from nodeCIDR +// annCIDR supernet of → allocate from the more specific nodeCIDR +// annCIDR subnet of → allocate from annCIDR (more restrictive) +// no overlap → caller error +// +// If annCIDRs is empty, all nodeCIDRs are eligible. +// If annCIDRs has multiple entries, the *first* that intersects any +// nodeCIDR wins, matching the design-doc phrasing "the agent uses the first +// one that intersects a node CIDR". +func resolveEffective(annCIDRs []*net.IPNet, nodeCIDRs []*net.IPNet) ([]*net.IPNet, error) { + if len(nodeCIDRs) == 0 { + return nil, fmt.Errorf("node has no CIDRs configured for this family") + } + if len(annCIDRs) == 0 { + return nodeCIDRs, nil + } + for _, ann := range annCIDRs { + var matches []*net.IPNet + for _, node := range nodeCIDRs { + m := intersectCIDR(ann, node) + if m != nil { + matches = append(matches, m) + } + } + if len(matches) > 0 { + return matches, nil + } + } + return nil, fmt.Errorf("annotation CIDRs %v do not intersect any node CIDR %v", + toStringSlice(annCIDRs), toStringSlice(nodeCIDRs)) +} + +// intersectCIDR returns the effective allocation range between an annotation +// CIDR and a node CIDR, or nil if disjoint. +func intersectCIDR(ann, node *net.IPNet) *net.IPNet { + // Same address family only. + if (ann.IP.To4() == nil) != (node.IP.To4() == nil) { + return nil + } + switch { + case cidrEqual(ann, node): + return node + case cidrContains(ann, node): + // ann is supernet of node → allocate from node. + return node + case cidrContains(node, ann): + // ann is subnet of node → allocate from ann. + return ann + default: + return nil + } +} + +func cidrEqual(a, b *net.IPNet) bool { + if !a.IP.Equal(b.IP) { + return false + } + ao, _ := a.Mask.Size() + bo, _ := b.Mask.Size() + return ao == bo +} + +// cidrContains returns true if `a` is a strict supernet of `b` (a ⊋ b). +func cidrContains(a, b *net.IPNet) bool { + ao, _ := a.Mask.Size() + bo, _ := b.Mask.Size() + if ao >= bo { + return false + } + return a.Contains(b.IP) +} + +func toStringSlice(ns []*net.IPNet) []string { + out := make([]string, len(ns)) + for i, n := range ns { + out[i] = n.String() + } + return out +} + +func canonical(ip net.IP) string { + if v4 := ip.To4(); v4 != nil { + return v4.String() + } + return ip.To16().String() +} + +func ipToU32(ip net.IP) uint32 { + v4 := ip.To4() + return uint32(v4[0])<<24 | uint32(v4[1])<<16 | uint32(v4[2])<<8 | uint32(v4[3]) +} + +func u32ToIP(u uint32) net.IP { + return net.IPv4(byte(u>>24), byte(u>>16), byte(u>>8), byte(u)).To4() +} diff --git a/pkg/agent/ipam_test.go b/pkg/agent/ipam_test.go new file mode 100644 index 0000000..02eebec --- /dev/null +++ b/pkg/agent/ipam_test.go @@ -0,0 +1,294 @@ +package agent + +import ( + "net" + "testing" + + "code.fritzlab.net/fritzlab/flock/pkg/embed" +) + +// fakeRand is a deterministic randSource for tests. +type fakeRand struct { + nibbles []byte // queue of NibbleN results + iids [][]byte + picks []int + n, i, p int +} + +func (f *fakeRand) NibbleN() byte { + v := f.nibbles[f.n%len(f.nibbles)] + f.n++ + return v & 0x0F +} +func (f *fakeRand) FillIID(dst []byte) { + src := f.iids[f.i%len(f.iids)] + f.i++ + copy(dst, src) +} +func (f *fakeRand) PickIndex(n int) int { + if len(f.picks) == 0 { + return 0 + } + v := f.picks[f.p%len(f.picks)] + f.p++ + if v >= n { + return 0 + } + return v +} + +func mustNet(t *testing.T, s string) *net.IPNet { + t.Helper() + _, n, err := net.ParseCIDR(s) + if err != nil { + t.Fatal(err) + } + return n +} + +func TestIntersectCIDR(t *testing.T) { + v6a := mustNet(t, "2602:817:3000:f001::/64") + v6super := mustNet(t, "2602:817:3000::/48") + v6sub := mustNet(t, "2602:817:3000:f001::/96") + v6other := mustNet(t, "2602:817:3000:f002::/64") + v4 := mustNet(t, "10.0.0.0/24") + + cases := []struct { + name string + ann *net.IPNet + node *net.IPNet + want *net.IPNet + }{ + {"equal → node", v6a, v6a, v6a}, + {"supernet of node → node", v6super, v6a, v6a}, + {"subnet of node → ann", v6sub, v6a, v6sub}, + {"disjoint same family → nil", v6other, v6a, nil}, + {"different family → nil", v4, v6a, nil}, + } + for _, c := range cases { + got := intersectCIDR(c.ann, c.node) + if (got == nil) != (c.want == nil) { + t.Errorf("%s: got=%v want=%v", c.name, got, c.want) + continue + } + if got != nil && !cidrEqual(got, c.want) { + t.Errorf("%s: got=%s want=%s", c.name, got, c.want) + } + } +} + +func TestResolveEffective(t *testing.T) { + nodes := []*net.IPNet{ + mustNet(t, "2602:817:3000:f001::/64"), + mustNet(t, "2602:817:3000:f002::/64"), + } + + // Empty annotation → all node CIDRs. + if got, err := resolveEffective(nil, nodes); err != nil || len(got) != 2 { + t.Fatalf("empty ann: got=%v err=%v", got, err) + } + + // Subnet of one node CIDR → that subnet only. + ann := []*net.IPNet{mustNet(t, "2602:817:3000:f001::/96")} + got, err := resolveEffective(ann, nodes) + if err != nil || len(got) != 1 || got[0].String() != "2602:817:3000:f001::/96" { + t.Fatalf("subnet ann: got=%v err=%v", got, err) + } + + // Supernet matches both node CIDRs. + ann = []*net.IPNet{mustNet(t, "2602:817:3000::/48")} + got, err = resolveEffective(ann, nodes) + if err != nil || len(got) != 2 { + t.Fatalf("supernet ann: got=%v err=%v", got, err) + } + + // First-match-wins: the first ann CIDR that intersects anything wins. + ann = []*net.IPNet{ + mustNet(t, "2602:817:3000:ff::/64"), // no intersection + mustNet(t, "2602:817:3000:f001::/64"), + } + got, err = resolveEffective(ann, nodes) + if err != nil || len(got) != 1 || got[0].String() != "2602:817:3000:f001::/64" { + t.Fatalf("first-match: got=%v err=%v", got, err) + } + + // No overlap → error. + ann = []*net.IPNet{mustNet(t, "2602:817:3000:ff::/64")} + if _, err := resolveEffective(ann, nodes); err == nil { + t.Fatalf("expected error for disjoint ann") + } +} + +func TestIPAM_AllocV6_Random(t *testing.T) { + i, err := NewIPAM([]string{"2602:817:3000:f001::/64"}, nil) + if err != nil { + t.Fatal(err) + } + i.randSrc = &fakeRand{ + iids: [][]byte{ + // 16 bytes; only the low 8 bytes are used for a /64 host portion. + {0, 0, 0, 0, 0, 0, 0, 0, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x11, 0x22}, + }, + } + res, err := i.Allocate(AllocRequest{ + ContainerID: "c1", Namespace: "ns", Pod: "p", WantV6: true, + }) + if err != nil { + t.Fatalf("Allocate: %v", err) + } + if res.IP6.String() != "2602:817:3000:f001:aabb:ccdd:eeff:1122" { + t.Fatalf("IP6 = %s, want 2602:817:3000:f001:aabb:ccdd:eeff:1122", res.IP6) + } +} + +func TestIPAM_AllocV6_WithEmbed(t *testing.T) { + i, err := NewIPAM([]string{"2602:817:3000:f001::/64"}, nil) + if err != nil { + t.Fatal(err) + } + i.randSrc = &fakeRand{nibbles: []byte{0xe}} + res, err := i.Allocate(AllocRequest{ + ContainerID: "c1", Namespace: "mail", Pod: "stalwart-0", WantV6: true, + IPAlgo: []embed.Field{embed.FieldNamespace, embed.FieldPod, embed.FieldImage}, + }) + if err != nil { + t.Fatalf("Allocate: %v", err) + } + // Last nibble from fakeRand NibbleN → 0xe. + if got := res.IP6[15] & 0x0F; got != 0xe { + t.Fatalf("last nibble = %x, want e", got) + } + mustNet(t, "2602:817:3000:f001::/64").Contains(res.IP6) +} + +func TestIPAM_AllocV6_CollisionRetry(t *testing.T) { + i, err := NewIPAM([]string{"2602:817:3000:f001::/64"}, nil) + if err != nil { + t.Fatal(err) + } + // Mark one specific address in-use, then feed the allocator that same + // address first and a new one on retry. + first := net.ParseIP("2602:817:3000:f001::1") + second := net.ParseIP("2602:817:3000:f001::2") + i.MarkInUse(first) + i.randSrc = &fakeRand{ + iids: [][]byte{ + append(make([]byte, 8), 0, 0, 0, 0, 0, 0, 0, 0x01), // -> ...:1 (collides) + append(make([]byte, 8), 0, 0, 0, 0, 0, 0, 0, 0x02), // -> ...:2 (ok) + }, + } + res, err := i.Allocate(AllocRequest{ + ContainerID: "c1", Namespace: "ns", Pod: "p", WantV6: true, + }) + if err != nil { + t.Fatalf("Allocate: %v", err) + } + if !res.IP6.Equal(second) { + t.Fatalf("IP6 = %s, want %s (collision retry)", res.IP6, second) + } +} + +func TestIPAM_AllocV4_SkipsNetworkAndGateway(t *testing.T) { + i, err := NewIPAM(nil, []string{"172.25.210.0/24"}) + if err != nil { + t.Fatal(err) + } + i.randSrc = &fakeRand{} + res, err := i.Allocate(AllocRequest{ + ContainerID: "c1", Namespace: "ns", Pod: "p", WantV4: true, + }) + if err != nil { + t.Fatalf("Allocate: %v", err) + } + if res.IP4.String() != "172.25.210.2" { + t.Fatalf("IP4 = %s, want 172.25.210.2 (skip .0 network + .1 gateway)", res.IP4) + } +} + +func TestIPAM_AllocV4_Sequential(t *testing.T) { + i, err := NewIPAM(nil, []string{"172.25.210.0/29"}) + if err != nil { + t.Fatal(err) + } + i.randSrc = &fakeRand{} + want := []string{"172.25.210.2", "172.25.210.3", "172.25.210.4", "172.25.210.5", "172.25.210.6"} + for _, w := range want { + res, err := i.Allocate(AllocRequest{ContainerID: w, WantV4: true}) + if err != nil { + t.Fatalf("Allocate: %v", err) + } + if res.IP4.String() != w { + t.Fatalf("got %s, want %s", res.IP4, w) + } + } + // Next allocation should fail — /29 exhausted (.0 network, .1 gateway, .7 broadcast). + if _, err := i.Allocate(AllocRequest{ContainerID: "extra", WantV4: true}); err == nil { + t.Fatalf("expected exhaustion error") + } +} + +func TestIPAM_DualStack(t *testing.T) { + i, err := NewIPAM( + []string{"2602:817:3000:f001::/64"}, + []string{"172.25.210.0/24"}, + ) + if err != nil { + t.Fatal(err) + } + i.randSrc = &fakeRand{ + iids: [][]byte{ + {0, 0, 0, 0, 0, 0, 0, 0, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x11, 0x22}, + }, + } + res, err := i.Allocate(AllocRequest{ + ContainerID: "c1", WantV6: true, WantV4: true, + }) + if err != nil { + t.Fatalf("Allocate: %v", err) + } + if res.IP6 == nil || res.IP4 == nil { + t.Fatalf("dual-stack result missing IPs: %+v", res) + } + if res.IP4.String() != "172.25.210.2" { + t.Fatalf("IP4 = %s", res.IP4) + } +} + +func TestIPAM_RejectsNoFamily(t *testing.T) { + i, _ := NewIPAM([]string{"2602:817:3000:f001::/64"}, nil) + if _, err := i.Allocate(AllocRequest{ContainerID: "c"}); err == nil { + t.Fatalf("expected error for no v6/v4") + } +} + +func TestIPAM_Release(t *testing.T) { + i, _ := NewIPAM([]string{"2602:817:3000:f001::/64"}, nil) + i.randSrc = &fakeRand{ + iids: [][]byte{ + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}, + }, + } + res1, err := i.Allocate(AllocRequest{ContainerID: "c1", WantV6: true}) + if err != nil { + t.Fatal(err) + } + i.Release(res1.IP6) + // Next Allocate should be free to pick the same address again. + res2, err := i.Allocate(AllocRequest{ContainerID: "c2", WantV6: true}) + if err != nil { + t.Fatal(err) + } + if !res2.IP6.Equal(res1.IP6) { + t.Fatalf("expected release to free %s; got %s", res1.IP6, res2.IP6) + } +} + +func TestIPAM_RejectsMismatchedFamily(t *testing.T) { + if _, err := NewIPAM([]string{"10.0.0.0/24"}, nil); err == nil { + t.Fatalf("expected v4 in cidr6 to fail") + } + if _, err := NewIPAM(nil, []string{"2602:817:3000::/64"}); err == nil { + t.Fatalf("expected v6 in cidr4 to fail") + } +}